-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
expand test column dtypes to full scale #492
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,14 +2,13 @@ | |
|
||
from typing import Any | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
from _pytest.fixtures import FixtureRequest | ||
from vivarium_testing_utils import FuzzyChecker | ||
|
||
from pseudopeople.dataset import Dataset | ||
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS | ||
from tests.constants import DATASET_GENERATION_FUNCS | ||
from tests.integration.conftest import IDX_COLS, _get_common_datasets, get_unnoised_data | ||
from tests.utilities import ( | ||
initialize_dataset_with_sample, | ||
|
@@ -49,6 +48,25 @@ def test_row_noising_omit_row_or_do_not_respond( | |
run_omit_row_or_do_not_respond_tests(dataset_name, config, original_data, noised_data) | ||
|
||
|
||
def test_column_dtypes( | ||
unnoised_dataset: Dataset, | ||
noised_data: pd.DataFrame, | ||
dataset_name: str, | ||
config: dict[str, Any], | ||
) -> None: | ||
"""Tests that column dtypes are as expected""" | ||
for col_name in noised_data.columns: | ||
col = COLUMNS.get_column(col_name) | ||
expected_dtype = col.dtype_name | ||
if expected_dtype == np.dtype(object): | ||
# str dtype is 'object' | ||
# Check that they are actually strings and not some other | ||
# type of object. | ||
actual_types = noised_data[col.name].dropna().apply(type) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is using `apply` here vectorized? I don't think it is, but maybe we don't have any other options? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe it's not vectorized, but it might be the fastest way according to this answer: https://stackoverflow.com/questions/55754713/fastest-way-to-find-all-data-types-in-a-pandas-series |
||
assert (actual_types == str).all(), actual_types.unique() | ||
assert noised_data[col.name].dtype == expected_dtype | ||
|
||
|
||
def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None: | ||
"""Tests that all datasets retain unnoised simulant_id and household_id | ||
(except for SSA which does not include household_id) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've asked this multiple times, but remind me again: how will we run this test as it previously existed (on sample data) but NOT during release testing? That is, we need to keep running the previous test every night, as we currently do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fixtures are set up to read in sample data whenever pytest is run without the `--release` flag.