# Patch summary (leading text before the first "diff --git" header; patch
# tools skip it).
#
# 1. src/mostlyai/qa/filesystem.py
#    _OLD_COL_PREFIX changes from r"(\w+)\." to r"^(tgt|ctx|nxt)\.".
#    The old pattern is unanchored and captures any run of word characters
#    followed by a literal dot; the new pattern only captures one of the three
#    literal prefixes "tgt", "ctx" or "nxt", and only at the start of the
#    string. A name such as "some.test" matches the old pattern but not the
#    new one.
#    _NEW_COL_PREFIX (r"\1⁝": group 1 followed by "⁝") is unchanged context.
#    NOTE(review): how the two constants are applied is outside this patch;
#    presumably they form a regex pattern/replacement pair -- confirm in
#    filesystem.py.
#
# 2. tests/end_to_end/test_report.py
#    Adds test_odd_column_names: builds a 100-row DataFrame whose three column
#    names contain a dot, punctuation/brackets/spaces, and a bare digit string;
#    passes it to report() as both syn_tgt_data and trn_tgt_data with
#    statistics_path=tmp_path / "stats" and asserts the returned metrics are
#    not None; then calls report_from_statistics() with the same
#    statistics_path and asserts the returned path is not None.
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy.
# Line breaks and Python indentation were reconstructed from syntax; every
# token is unchanged. Whitespace-only context lines could not be recovered,
# so the first hunk shows fewer lines than its "-25,7 +25,7" header declares,
# and it is not certain whether "from sklearn.decomposition import PCA" is the
# hunk's section heading or its first context line. The indentation of the
# context lines in the second hunk is likewise assumed. Regenerate the patch
# from the repository before applying it.
diff --git a/src/mostlyai/qa/filesystem.py b/src/mostlyai/qa/filesystem.py
index aa41d30..4e66aa1 100644
--- a/src/mostlyai/qa/filesystem.py
+++ b/src/mostlyai/qa/filesystem.py
@@ -25,7 +25,7 @@ from sklearn.decomposition import PCA
-_OLD_COL_PREFIX = r"(\w+)\."
+_OLD_COL_PREFIX = r"^(tgt|ctx|nxt)\."
 _NEW_COL_PREFIX = r"\1⁝"
diff --git a/tests/end_to_end/test_report.py b/tests/end_to_end/test_report.py
index 719043e..cefad37 100644
--- a/tests/end_to_end/test_report.py
+++ b/tests/end_to_end/test_report.py
@@ -224,3 +224,25 @@ def test_report_sequential_few_records(tmp_path):
         ctx_primary_key="id",
     )
     assert metrics is not None
+
+
+def test_odd_column_names(tmp_path):
+    values = ["a", "b"] * 50
+    df = pd.DataFrame(
+        {
+            "some.test": values,
+            "foo%bar|this-long{c[u]rly} *": values,
+            "3": values,
+        }
+    )
+    path, metrics = report(
+        syn_tgt_data=df,
+        trn_tgt_data=df,
+        statistics_path=tmp_path / "stats",
+    )
+    assert metrics is not None
+    path = report_from_statistics(
+        syn_tgt_data=df,
+        statistics_path=tmp_path / "stats",
+    )
+    assert path is not None