Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expand test column dtypes to full scale #492

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions tests/integration/release/test_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@

from typing import Any

import numpy as np
import pandas as pd
import pytest
from _pytest.fixtures import FixtureRequest
from vivarium_testing_utils import FuzzyChecker

from pseudopeople.dataset import Dataset
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.constants import DATASET_GENERATION_FUNCS
from tests.integration.conftest import IDX_COLS, _get_common_datasets, get_unnoised_data
from tests.utilities import (
initialize_dataset_with_sample,
Expand Down Expand Up @@ -49,6 +48,25 @@ def test_row_noising_omit_row_or_do_not_respond(
run_omit_row_or_do_not_respond_tests(dataset_name, config, original_data, noised_data)


def test_column_dtypes(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've asked this multiple times but remind me again - how will we run this test as it previously existed (on sample data) but NOT during release-testing? i.e. we need to continue running the previous test every night like we currently are.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fixtures are set up to read in sample data if we don't run pytest with the --release flag.

unnoised_dataset: Dataset,
noised_data: pd.DataFrame,
dataset_name: str,
config: dict[str, Any],
) -> None:
"""Tests that column dtypes are as expected"""
for col_name in noised_data.columns:
col = COLUMNS.get_column(col_name)
expected_dtype = col.dtype_name
if expected_dtype == np.dtype(object):
# str dtype is 'object'
# Check that they are actually strings and not some other
# type of object.
actual_types = noised_data[col.name].dropna().apply(type)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is using apply here vectorized? I don't think it is, but maybe we don't have any other options?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it's not vectorized, but it might be the fastest way according to this answer:

https://stackoverflow.com/questions/55754713/fastest-way-to-find-all-data-types-in-a-pandas-series

assert (actual_types == str).all(), actual_types.unique()
assert noised_data[col.name].dtype == expected_dtype


def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
"""Tests that all datasets retain unnoised simulant_id and household_id
(except for SSA which does not include household_id)
Expand Down
49 changes: 0 additions & 49 deletions tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,55 +175,6 @@ def test_seed_behavior(
assert not noised_data.equals(noised_data_different_seed)


@pytest.mark.parametrize(
    "dataset_name",
    [
        DATASET_SCHEMAS.census.name,
        DATASET_SCHEMAS.acs.name,
        DATASET_SCHEMAS.cps.name,
        DATASET_SCHEMAS.ssa.name,
        DATASET_SCHEMAS.tax_w2_1099.name,
        DATASET_SCHEMAS.wic.name,
        DATASET_SCHEMAS.tax_1040.name,
    ],
)
@pytest.mark.parametrize("engine", ["pandas", "dask"])
def test_column_dtypes(
    dataset_name: str, engine: str, config: dict[str, Any], request: FixtureRequest
) -> None:
    """Tests that column dtypes are as expected"""
    # Placeholder datasets are marked with "TODO" in their name; skip those.
    if "TODO" in dataset_name:
        pytest.skip(reason=dataset_name)

    if engine == "dask":
        # Dask output is lazy — generate and materialize it here.
        generation_function = DATASET_GENERATION_FUNCS[dataset_name]
        noised_data = generation_function(
            seed=SEED, year=None, config=config, engine=engine
        ).compute()
    else:
        # Pandas path reuses the pre-generated sample-data fixture.
        noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")

    for column_name in noised_data.columns:
        column = COLUMNS.get_column(column_name)
        expected_dtype = column.dtype_name
        if expected_dtype == np.dtype(object):
            # An 'object' dtype can hide non-string values; confirm every
            # non-null entry is actually a str, not some other object type.
            value_types = noised_data[column.name].dropna().apply(type)
            assert (value_types == str).all(), value_types.unique()
        assert noised_data[column.name].dtype == expected_dtype


@pytest.mark.skip(reason="TODO: Implement duplication row noising")
@pytest.mark.parametrize(
"dataset_name",
Expand Down