Skip to main content

A Python package for data quality unit testing.

Project description

Glossary of Checks

CheckColumnAllSame

check = CheckColumnAllSame("Check-Same-All", "Col_Same", "A")

CheckColumnIsNotNull

check = CheckColumnIsNotNull("Check-Id-NotNull", "Col_Id")

CheckColumnIsNull

check = CheckColumnIsNull("Check-Empty-IsNull", "Col_Empty")

CheckColumnIsUnique

check = CheckColumnIsUnique("Check-Id-Uniuqe", "Col_Id")

CheckColumnFunctionAll

Column values equal (eq / ==)

Can be used instead of CheckColumnAllSame

check = CheckColumnFunctionAll("Check-Same-eq", "Col_Same", "A", Functions.check_column_equal)

Column values not equal (ne / !=)

check = CheckColumnFunctionAll("Check-Same-ne", "Col_Same", "B", Functions.check_column_not_equal)

Column values lower than (lt / <)

check = CheckColumnFunctionAll("Check-Year-lt", "Col_Year", 2100, Functions.check_column_lower_than)

Column values greater than (gt / >)

check = CheckColumnFunctionAll("Check-Year-gt", "Col_Year", 1900, Functions.check_column_greater_than)

Column values lower than or equal to (le / <=)

check = CheckColumnFunctionAll("Check-Year-le", "Col_Year", 2022, Functions.check_column_lower_equal_than)

Column values greater than or equal to (ge / >=)

check = CheckColumnFunctionAll("Check-Year-ge", "Col_Year", 1901, Functions.check_column_greater_equal_than)

Column values between

vals = [1900, datetime.today().year + 1]
check = CheckColumnFunctionAll("Check-Year-between", "Col_Year", vals, Functions.check_column_between)

Column values in set of values

vals = {"M", "F"}
check = CheckColumnFunctionAll("Check-Gender-InSet", "Col_Gender", vals, Functions.check_column_in_set)

Column values not in set of values

vals = {1, 2}
check = CheckColumnFunctionAll("Check-Gender-NotInSet", "Col_Gender", vals, Functions.check_column_not_in_set)

Column values length between

vals = [0, 4]
check = CheckColumnFunctionAll("Check-Year-LengthBetween", "Col_Year", vals, Functions.check_column_length_between)

Column values length equal

check = CheckColumnFunctionAll("Check-Gender-LengthEqual", "Col_Gender", 1, Functions.check_column_length_equal)

CheckColumnRegexAll

Column values match regex

regex = r"^[0-9]{4}$"
check = CheckColumnRegexAll("Check-Year-Match-4digit-Regex", "Col_Year", regex)

Column values don't match regex

regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
check = CheckColumnRegexAll("Check-Year-Dont-Match-EMail-Regex", "Col_Year", regex, False)

CheckColumnLikeAll

Column values match like pattern (% end)

year = datetime.today().strftime("%Y")
like = f"{year}-%"
check = CheckColumnLikeAll("Check-Export-Match-Like-End", "Col_Export", like)

Column values match like pattern (% start & end)

month = datetime.today().strftime("%m")
like = f"%-{month}-%"
check = CheckColumnLikeAll("Check-Export-Match-Like-Start&End", "Col_Export", like)

Column values match like pattern (% start)

day = datetime.today().strftime("%d")
like = f"%-{day}"
check = CheckColumnLikeAll("Check-Export-Match-Like-Start", "Col_Export", like)

Column values don't match like pattern (% end)

day = datetime.today().strftime("%d")
like = f"{day}.%"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-End", "Col_Export", like, False)

Column values don't match like pattern (% start & end)

month = datetime.today().strftime("%m")
like = f"%.{month}.%"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-Start&End", "Col_Export", like, False)

Column values don't match like pattern (% start)

year = datetime.today().strftime("%Y")
like = f"%.{year}"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-Start", "Col_Export", like, False)

CheckColumnMatchStrftime

format = "%Y-%m-%d"
check = CheckColumnMatchStrftime("Check-Export-Match-Strftim-Format", "Col_Export", format)

CheckColumnDateutilParseable

check = CheckColumnDateutilParseable("Check-Export-Dateuitl-Parseable", "Col_Export")

Full Example

from os import linesep
from io import StringIO
from datetime import datetime
import pandas as pd

from random import seed, random, randint

from unswamp.objects import CheckSuite
from unswamp.checks import (
    CheckColumnIsNotNull,
    CheckColumnIsUnique,
    CheckColumnIsNull,
    CheckColumnAllSame,
    CheckColumnFunctionAll,
    CheckColumnLikeAll,
    CheckColumnRegexAll,
    CheckColumnMatchStrftime,
    CheckColumnDateutilParseable,
    Functions,
)

#################################################
# Methods for dummy data
#################################################
def build_csv(records=1000000, curr_seed=42, same="A"):
    """Build dummy CSV text with a header row followed by ``records`` data rows.

    Columns: ``Col_Id`` (row index), ``Col_Empty`` (always empty),
    ``Col_Same`` (the ``same`` value), ``Col_Year`` (random year between 1901
    and the current year, inclusive), ``Col_Gender`` ("M" or "F") and
    ``Col_Export`` (today's date formatted as ``%Y-%m-%d``).

    Args:
        records: Number of data rows to generate.
        curr_seed: Seed passed to ``random.seed`` so the output is reproducible.
        same: Constant value written into ``Col_Same`` on every row.

    Returns:
        The CSV content as a single string; every line, including the last,
        is terminated with ``os.linesep``.
    """
    seed(curr_seed)

    # Loop invariants: take "today" once so every row agrees even if the
    # function happens to run across midnight, and avoid per-row strftime calls.
    today = datetime.today()
    max_year = today.year
    export = today.strftime("%Y-%m-%d")

    # Collect the lines and join once; repeated ``+=`` on a growing string can
    # degrade to quadratic time for the default one million rows.
    parts = [f"Col_Id,Col_Empty,Col_Same,Col_Year,Col_Gender,Col_Export{linesep}"]
    for pos in range(records):
        # Keep the draw order (randint, then random) so a given seed yields
        # the same sequence of values as before.
        year = randint(1901, max_year)
        gender = "M" if random() > 0.5 else "F"
        parts.append(f"{pos},,{same},{year},{gender},{export}{linesep}")
    return "".join(parts)


def build_dataset(csv):
    """Parse CSV text into a pandas DataFrame.

    The text is wrapped in an in-memory ``StringIO`` buffer so that
    ``pd.read_csv`` can consume it like a file.
    """
    return pd.read_csv(StringIO(csv))


#################################################
# Create dataset
#################################################
# Generate the dummy CSV text with build_csv's default arguments and parse it
# into a pandas DataFrame via build_dataset.
csv = build_csv()
dataset = build_dataset(csv)

#################################################
# Create CheckSuite
#################################################
# The suite is constructed from a dataset name and the dataset itself; checks
# are attached to it afterwards.
dataset_name = "Dummy"
suite = CheckSuite(dataset_name, dataset)

#################################################
# Add checks to CheckSuite
#################################################
# Each check below is built with an id, the column name it targets and (where
# applicable) a comparison value plus the Functions.* callable to use, and is
# then registered on the suite.

# CheckColumnAllSame
check = CheckColumnAllSame("Check-Same-All", "Col_Same", "A")
suite.add_check(check)

# CheckColumnIsNotNull
check = CheckColumnIsNotNull("Check-Id-NotNull", "Col_Id")
suite.add_check(check)

# CheckColumnIsNull
check = CheckColumnIsNull("Check-Empty-IsNull", "Col_Empty")
suite.add_check(check)

# CheckColumnIsUnique
check = CheckColumnIsUnique("Check-Id-Uniuqe", "Col_Id")
suite.add_check(check)

# CheckColumnFunctionAll
# eq / ==
check = CheckColumnFunctionAll("Check-Same-eq", "Col_Same", "A", Functions.check_column_equal)
suite.add_check(check)

# ne / !=
check = CheckColumnFunctionAll("Check-Same-ne", "Col_Same", "B", Functions.check_column_not_equal)
suite.add_check(check)

# lt / <
check = CheckColumnFunctionAll("Check-Year-lt", "Col_Year", 2100, Functions.check_column_lower_than)
suite.add_check(check)

# gt / >
check = CheckColumnFunctionAll("Check-Year-gt", "Col_Year", 1900, Functions.check_column_greater_than)
suite.add_check(check)

# le / <=
# The upper bound tracks the current year: the dummy data draws Col_Year up to
# datetime.today().year, so a fixed literal such as 2022 stops being a valid
# upper bound as soon as the calendar moves past it.
max_year = datetime.today().year
check = CheckColumnFunctionAll("Check-Year-le", "Col_Year", max_year, Functions.check_column_lower_equal_than)
suite.add_check(check)

# ge / >=
check = CheckColumnFunctionAll("Check-Year-ge", "Col_Year", 1901, Functions.check_column_greater_equal_than)
suite.add_check(check)

# between
vals = [1900, datetime.today().year + 1]
check = CheckColumnFunctionAll("Check-Year-between", "Col_Year", vals, Functions.check_column_between)
suite.add_check(check)

# values in set
vals = {"M", "F"}
check = CheckColumnFunctionAll("Check-Gender-InSet", "Col_Gender", vals, Functions.check_column_in_set)
suite.add_check(check)

# values not in set
vals = {1, 2}
check = CheckColumnFunctionAll("Check-Gender-NotInSet", "Col_Gender", vals, Functions.check_column_not_in_set)
suite.add_check(check)

# values length between
vals = [0, 4]
check = CheckColumnFunctionAll("Check-Year-LengthBetween", "Col_Year", vals, Functions.check_column_length_between)
suite.add_check(check)

# values length equal
check = CheckColumnFunctionAll("Check-Gender-LengthEqual", "Col_Gender", 1, Functions.check_column_length_equal)
suite.add_check(check)

# Capture "now" once so the year/month/day fragments used in the like
# patterns below all come from the same instant.
today = datetime.today()

# CheckColumnRegexAll
# match values regex
regex = r"^[0-9]{4}$"
check = CheckColumnRegexAll("Check-Year-Match-4digit-Regex", "Col_Year", regex)
suite.add_check(check)

# don't match values regex
# NOTE: the top-level-domain class is [A-Za-z]; writing it as [A-Z|a-z] would
# also accept a literal "|" because "|" is not an alternation inside [...].
regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
check = CheckColumnRegexAll("Check-Year-Dont-Match-EMail-Regex", "Col_Year", regex, False)
suite.add_check(check)

# CheckColumnLikeAll
# match like pattern % end
year = today.strftime("%Y")
like = f"{year}-%"
check = CheckColumnLikeAll("Check-Export-Match-Like-End", "Col_Export", like)
suite.add_check(check)

# match like pattern % start & end
month = today.strftime("%m")
like = f"%-{month}-%"
check = CheckColumnLikeAll("Check-Export-Match-Like-Start&End", "Col_Export", like)
suite.add_check(check)

# match like pattern % start
day = today.strftime("%d")
like = f"%-{day}"
check = CheckColumnLikeAll("Check-Export-Match-Like-Start", "Col_Export", like)
suite.add_check(check)

# not match like pattern % end
day = today.strftime("%d")
like = f"{day}.%"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-End", "Col_Export", like, False)
suite.add_check(check)

# not match like pattern % start & end
month = today.strftime("%m")
like = f"%.{month}.%"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-Start&End", "Col_Export", like, False)
suite.add_check(check)

# not match like pattern % start
year = today.strftime("%Y")
like = f"%.{year}"
check = CheckColumnLikeAll("Check-Export-Not-Match-Like-Start", "Col_Export", like, False)
suite.add_check(check)

# CheckColumnMatchStrftime
# Named strftime_format rather than format to avoid shadowing the builtin.
strftime_format = "%Y-%m-%d"
check = CheckColumnMatchStrftime("Check-Export-Match-Strftim-Format", "Col_Export", strftime_format)
suite.add_check(check)

# CheckColumnDateutilParseable
check = CheckColumnDateutilParseable("Check-Export-Dateuitl-Parseable", "Col_Export")
suite.add_check(check)

#################################################
# Run Checks
#################################################
# Run the suite; the returned object exposes the individual outcomes through
# its ``results`` attribute.
check_run = suite.run()

#################################################
# Print result
#################################################
# One line per result: passed flag, duration and check id, separated by " - ".
for result in check_run.results:
    print(f"{result.passed} - {result.duration} - {result.id}")

Credits

Icon by Ary Prasetyo https://thenounproject.com/search/?q=swamp&i=1592639

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

unswamp-1.0.2.tar.gz (11.9 kB view hashes)

Uploaded Source

Built Distribution

unswamp-1.0.2-py3-none-any.whl (20.1 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page