Skip to content

Commit

Permalink
adding configuration options to uniques functionality (#224)
Browse files Browse the repository at this point in the history
adding configuration options to uniques functionality (#224)

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

update doc string on null columns everywhere and fix typo

Update docs

Co-authored-by: Kevin Klein <[email protected]>

Update docs

Co-authored-by: Kevin Klein <[email protected]>

Update docs

Co-authored-by: Kevin Klein <[email protected]>

docs updates

update docs

filternull docs clarification

replace assert by raise ValueError

shorten name to apply_output_formatting

add unit tests for new utils functions

set default to limit 100 elements

ensure all relevant tests run for impala and ensure they pass

disable extralong test for bigquery due to slow speed

capitalization test handle parallel if table already created
  • Loading branch information
SimonLangerQC authored and kklein committed Jun 25, 2024
1 parent 2043974 commit 28742ec
Show file tree
Hide file tree
Showing 11 changed files with 1,427 additions and 53 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ jobs:
uses: ./.github/actions/pytest
with:
backend: bigquery
args: -n auto tests/integration
args: -n 16 -v tests/integration

impala-column:
if: ${{ contains(github.event.pull_request.labels.*.name, 'impala') || contains(github.event.pull_request.labels.*.name, 'ready') || github.ref == 'refs/heads/main' }}
Expand All @@ -275,7 +275,11 @@ jobs:
matrix:
PYTHON_VERSION: [ '3.8' ]
SA_VERSION: ["<2.0"]
PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]
# PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]

# more comprehensive matching; note that tests which start with test_i and not test_integer are not matched and must be added here

PYTEST_ARG: ["tests/integration/test_integration.py -k 'test_a or test_b or test_c or test_d or test_e or test_f or test_g or test_h or test_integer or test_j or test_k or test_l or test_m'", "tests/integration/test_integration.py -k 'test_n or test_o or test_p or test_q or test_r or test_s or test_t or test_u or test_v or test_w or test_x or test_y or test_z'"]

steps:
- name: Checkout branch
Expand Down
15 changes: 15 additions & 0 deletions run_integration_tests_postgres.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Run the integration test suite against a local dockerized Postgres instance.
# Usage: ./run_integration_tests_postgres.sh [extra pytest args]

# Stop any Postgres test container left over from a previous run.
docker stop $(docker ps -q --filter name=postgres_datajudge)

# Start Postgres in the background and repeatedly prompt until the user
# confirms it is ready (start_postgres.sh blocks, hence the background jobs).
./start_postgres.sh &
bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" &

# Fixed typo ("to once" -> "once") so the prompt matches the nag message above.
read -p "Press enter once postgres is ready: "
kill %%

echo "STARTING PYTEST"
pytest tests/integration -vv --backend=postgres "$@"

# Clean up the container afterwards.
docker stop $(docker ps -q --filter name=postgres_datajudge)

25 changes: 23 additions & 2 deletions src/datajudge/constraints/base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import abc
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, TypeVar
from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union

import sqlalchemy as sa

from ..db_access import DataReference
from ..formatter import Formatter
from ..utils import OutputProcessor, output_processor_limit

DEFAULT_FORMATTER = Formatter()

Expand Down Expand Up @@ -113,7 +114,15 @@ class Constraint(abc.ABC):
"""

def __init__(
self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None
self,
ref: DataReference,
*,
ref2=None,
ref_value: Any = None,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = output_processor_limit,
):
self._check_if_valid_between_or_within(ref2, ref_value)
self.ref = ref
Expand All @@ -125,6 +134,12 @@ def __init__(
self.factual_queries: Optional[List[str]] = None
self.target_queries: Optional[List[str]] = None

if (output_processors is not None) and (
not isinstance(output_processors, list)
):
output_processors = [output_processors]
self.output_processors = output_processors

def _check_if_valid_between_or_within(
self, ref2: Optional[DataReference], ref_value: Optional[Any]
):
Expand Down Expand Up @@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
target_queries,
)

def apply_output_formatting(self, values: Collection) -> Collection:
    """Pipe ``values`` through all configured output processors.

    ``self.output_processors`` is either ``None`` (formatting disabled) or a
    list of callables; each is applied in order to the running result.
    Each processor returns a 2-tuple; only the first element (the processed
    values) is kept here — the second (presumably associated counts, used by
    some callers elsewhere) is discarded.
    """
    if self.output_processors is not None:
        for output_processor in self.output_processors:
            # Processors return (values, counts); counts are unused in this path.
            values, _ = output_processor(values)
    return values


def format_sample(sample, ref: DataReference) -> str:
"""Build a string from a database row indicating its column values."""
Expand Down
11 changes: 9 additions & 2 deletions src/datajudge/constraints/miscs.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
return TestResult.success()

assertion_text = (
f"{self.ref} has violations of functional dependence, e.g.:\n"
+ "\n".join([f"{tuple(violation)}" for violation in violations][:5])
f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n"
+ "\n".join(
[
f"{violation}"
for violation in self.apply_output_formatting(
[tuple(elem) for elem in violations]
)
]
)
)
return TestResult.failure(assertion_text)

Expand Down
122 changes: 103 additions & 19 deletions src/datajudge/constraints/uniques.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
import warnings
from collections import Counter
from itertools import zip_longest
from math import ceil, floor
Expand All @@ -8,6 +9,7 @@

from .. import db_access
from ..db_access import DataReference
from ..utils import OutputProcessor, filternull_element, output_processor_limit
from .base import Constraint, OptionalSelections, T, TestResult, ToleranceGetter


Expand Down Expand Up @@ -42,9 +44,21 @@ class Uniques(Constraint, abc.ABC):
are part of a reference set of expected values - either externally supplied
through parameter `uniques` or obtained from another `DataSource`.
Null values in the column are ignored. To assert the non-existence of them use
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
`WithinRequirement`.
Null values in the columns ``columns`` are ignored. To assert the non-existence of them use
the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
for ``WithinRequirement``.
By default, the null filtering does not trigger if multiple columns are fetched at once.
It can be configured in more detail by supplying a custom ``filter_func`` function.
Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
:func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
:func:`~datajudge.utils.filternull_element_or_tuple_any`.
Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element` but triggers a warning.
The current default of :func:`~datajudge.utils.filternull_element`
can cause (possibly unintended) changes in behavior when the user adds a second column
(filtering can then no longer trigger at all).
The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
To silence the warning, set ``filter_func`` explicitly.
There are two ways to do some post processing of the data obtained from the
database by providing a function to be executed. In general, no postprocessing
Expand All @@ -63,6 +77,29 @@ class Uniques(Constraint, abc.ABC):
(eager or lazy) of the same type as the type of the values of the column (in their
Python equivalent).
Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
threshold for the proportion of elements in the data that can violate the constraint
(default: 0).
Setting this argument is currently not supported for `UniquesEquality`.
For `UniquesSubset`, by default,
the number of occurrences affects the computed fraction of violations.
To disable this weighting, set `compare_distinct=True`.
This argument does not have an effect on the test results for other `Uniques` constraints,
or if `max_relative_violations` is 0.
By default, the assertion messages make use of sets,
thus, they may differ from run to run despite the exact same situation being present,
and can have an arbitrary length.
To enforce a reproducible, limited output via (e.g.) sorting and slicing,
set `output_processors` to a callable or a list of callables. By default, only the first 100 elements are displayed (:func:`~datajudge.utils.output_processor_limit`).
Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
In most cases, the second argument is simply None,
but for `UniquesSubset` it is the counts of each of the elements.
The suggested functions are :func:`~datajudge.utils.output_processor_sort` and :func:`~datajudge.utils.output_processor_limit`
- see their respective docstrings for details.
One use of this constraint is to test for consistency in columns with expected
categorical values.
"""
Expand All @@ -71,26 +108,44 @@ def __init__(
self,
ref: DataReference,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = output_processor_limit,
*,
ref2: DataReference = None,
uniques: Collection = None,
filter_func: Callable[[List[T]], List[T]] = None,
map_func: Callable[[T], T] = None,
reduce_func: Callable[[Collection], Collection] = None,
max_relative_violations=0,
compare_distinct=False,
):
ref_value: Optional[Tuple[Collection, List]]
ref_value = (uniques, []) if uniques else None
super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name)
super().__init__(
ref,
ref2=ref2,
ref_value=ref_value,
name=name,
output_processors=output_processors,
)

if filter_func is None:
warnings.warn("Using deprecated default null filter function.")
filter_func = filternull_element

self.filter_func = filter_func
self.local_func = map_func
self.global_func = reduce_func
self.max_relative_violations = max_relative_violations
self.compare_distinct = compare_distinct

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Tuple[List[T], List[int]], OptionalSelections]:
uniques, selection = db_access.get_uniques(engine, ref)
values = list(uniques.keys())
values = list(filter(lambda value: value is not None, values))
values = self.filter_func(values)
counts = [uniques[value] for value in values]
if self.local_func:
values = list(map(self.local_func, values))
Expand All @@ -106,7 +161,11 @@ def retrieve(
class UniquesEquality(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("max_relative_violations"):
raise RuntimeError("Some useful message")
raise RuntimeError(
"max_relative_violations is not supported for UniquesEquality."
)
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesEquality.")
super().__init__(args, name=name, **kwargs)

def compare(
Expand All @@ -123,22 +182,22 @@ def compare(
if not is_subset and not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' and has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting(lacking_values)}' and has the excess element(s) "
f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_subset:
assertion_text = (
f"{self.ref} has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' when compared with the reference values. "
f"'{self.apply_output_formatting(lacking_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
Expand All @@ -153,28 +212,49 @@ def compare(
) -> Tuple[bool, Optional[str]]:
factual_values, factual_counts = factual
target_values, _ = target

is_subset, remainder = _subset_violation_counts(
factual_values, factual_counts, target_values
)
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
if not self.compare_distinct:
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
else:
n_rows = len(factual_values)
n_violations = len(remainder)

if (
n_rows > 0
and (relative_violations := (n_violations / n_rows))
> self.max_relative_violations
):
output_elemes, output_counts = list(remainder.keys()), list(
remainder.values()
)
if self.output_processors is not None:
for output_processor in self.output_processors:
output_elemes, output_counts = output_processor(
output_elemes, output_counts
)

assertion_text = (
f"{self.ref} has a fraction of {relative_violations} > "
f"{self.max_relative_violations} values not being an element of "
f"'{set(target_values)}'. It has e.g. excess elements "
f"'{list(remainder.keys())[:5]}'."
f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of "
f"'{self.apply_output_formatting(set(target_values))}'. It has excess elements "
f"'{output_elemes}' "
f"with counts {output_counts}."
f"{self.condition_string}"
)
return False, assertion_text
return True, None


class UniquesSuperset(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
super().__init__(args, name=name, **kwargs)

def compare(
self,
factual: Tuple[List[T], List[int]],
Expand All @@ -185,14 +265,18 @@ def compare(
is_superset, remainder = _is_superset(factual_values, target_values)
if (
len(factual_values) > 0
and (relative_violations := (len(remainder) / len(target_values)))
and (
relative_violations := (
(n_violations := (len(remainder))) / (n_rows := len(target_values))
)
)
> self.max_relative_violations
):
assertion_text = (
f"{self.ref} has a fraction of "
f"{relative_violations} > {self.max_relative_violations} "
f"lacking unique values of '{set(target_values)}'. E.g. it "
f"doesn't have the unique value(s) '{list(remainder)[:5]}'."
f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) "
f"lacking unique values of '{self.apply_output_formatting(set(target_values))}'. It "
f"doesn't have the unique value(s) '{self.apply_output_formatting(list(remainder))}'."
f"{self.condition_string}"
)
return False, assertion_text
Expand Down
Loading

0 comments on commit 28742ec

Please sign in to comment.