diff --git a/coh.ipynb b/coh.ipynb new file mode 100644 index 0000000..9bf44f5 --- /dev/null +++ b/coh.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CATEGORIES PER SEQUENCE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df_tgt = pd.read_csv(\"https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/baseball/batting.csv.gz\")\n", + "df_tgt.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._coherence import pull_data_for_coherence\n", + "\n", + "df_tgt = pull_data_for_coherence(df_tgt=df_tgt, tgt_context_key=\"players_id\")\n", + "df_tgt.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._coherence import calculate_categories_per_sequence\n", + "\n", + "categories_per_sequence_df = calculate_categories_per_sequence(df=df_tgt, context_key=\"players_id\")\n", + "categories_per_sequence_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._accuracy import calculate_numeric_uni_kdes\n", + "\n", + "trn_num_kdes = calculate_numeric_uni_kdes(categories_per_sequence_df)\n", + "trn_num_kdes[\"team\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._accuracy import bin_data\n", + "\n", + "\n", + "cat_share_per_sequence_binned = bin_data(categories_per_sequence_df, bins=10)[0]\n", + "cat_share_per_sequence_binned.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._accuracy import calculate_categorical_uni_counts\n", + "\n", + "\n", + "trn_bin_col_cnts = calculate_categorical_uni_counts(df=cat_share_per_sequence_binned, hash_rare_values=False)\n", + "trn_bin_col_cnts[\"team\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._accuracy import plot_univariate\n", + "\n", + "for col in categories_per_sequence_df.columns:\n", + " if col != \"players_id\": # Skip the context key\n", + " display(\n", + " plot_univariate(\n", + " col_name=col,\n", + " trn_num_kde=trn_num_kdes.get(col),\n", + " syn_num_kde=trn_num_kdes.get(col),\n", + " trn_cat_col_cnts=None,\n", + " syn_cat_col_cnts=None,\n", + " trn_bin_col_cnts=trn_bin_col_cnts[col],\n", + " syn_bin_col_cnts=trn_bin_col_cnts[col],\n", + " accuracy=0.5,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SEQUENCES PER CATEGORY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._coherence import calculate_sequences_per_category\n", + "\n", + "sequences_per_category_dict, sequences_per_category_binned_dict = calculate_sequences_per_category(\n", + " df=df_tgt, context_key=\"players_id\"\n", + ")\n", + "display(sequences_per_category_dict[\"team\"].head(2))\n", + "display(sequences_per_category_binned_dict[\"team\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mostlyai.qa._accuracy import plot_univariate\n", + "\n", + "for col in categories_per_sequence_df.columns:\n", + " if col != \"players_id\": # Skip the context key\n", + " display(\n", + " plot_univariate(\n", + " col_name=col,\n", + " trn_num_kde=None,\n", + " syn_num_kde=None,\n", + " trn_cat_col_cnts=sequences_per_category_dict[col],\n", + " syn_cat_col_cnts=sequences_per_category_dict[col],\n", + " trn_bin_col_cnts=sequences_per_category_binned_dict[col],\n", + " syn_bin_col_cnts=sequences_per_category_binned_dict[col],\n", + " accuracy=0.5,\n", + " trn_col_total=df_tgt[\"players_id\"].nunique(),\n", + " syn_col_total=df_tgt[\"players_id\"].nunique(),\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 25433b7..4e7d8ca 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -434,6 +434,8 @@ def plot_univariate( trn_bin_col_cnts: pd.Series, syn_bin_col_cnts: pd.Series, accuracy: float | None, + trn_col_total: int | None = None, + syn_col_total: int | None = None, ) -> go.Figure: # either numerical/datetime KDEs or categorical counts must be provided @@ -485,8 +487,16 @@ def plot_univariate( fig.layout.xaxis2.update(type="category") else: fig.layout.yaxis.update(tickformat=".0%") - trn_line1, syn_line1 = plot_univariate_distribution_categorical(trn_cat_col_cnts, syn_cat_col_cnts) - trn_line2, syn_line2 = plot_univariate_binned(trn_bin_col_cnts, syn_bin_col_cnts, sort_by_frequency=True) + trn_line1, syn_line1 = plot_univariate_distribution_categorical( + trn_cat_col_cnts, syn_cat_col_cnts, trn_col_total, syn_col_total + ) + trn_line2, syn_line2 = plot_univariate_binned( + trn_bin_col_cnts, + syn_bin_col_cnts, + sort_by_frequency=True, + trn_col_total=trn_col_total, + syn_col_total=syn_col_total, + ) # prevent Plotly from trying to convert strings to dates fig.layout.xaxis.update(type="category") fig.layout.xaxis2.update(type="category") @@ -505,6 +515,8 @@ def plot_univariate( def prepare_categorical_plot_data_distribution( trn_col_cnts: pd.Series, syn_col_cnts: pd.Series, + trn_col_total: int | None = None, + syn_col_total: int | None = None, ) -> pd.DataFrame: trn_col_cnts_idx = trn_col_cnts.index.to_series().astype("string").fillna(NA_BIN).replace("", EMPTY_BIN) syn_col_cnts_idx = syn_col_cnts.index.to_series().astype("string").fillna(NA_BIN).replace("", EMPTY_BIN) @@ -517,8 +529,8 @@ def prepare_categorical_plot_data_distribution( df["synthetic_cnt"] = df["synthetic_cnt"].fillna(0.0) df["avg_cnt"] = (df["target_cnt"] + df["synthetic_cnt"]) / 2 df = df[df["avg_cnt"] > 0] - df["target_pct"] = df["target_cnt"] / df["target_cnt"].sum() - df["synthetic_pct"] = df["synthetic_cnt"] / df["synthetic_cnt"].sum() + df["target_pct"] = df["target_cnt"] / (trn_col_total or df["target_cnt"].sum()) + df["synthetic_pct"] = df["synthetic_cnt"] / (syn_col_total or df["synthetic_cnt"].sum()) df = df.rename(columns={"index": "category"}) if df["category"].dtype.name == "category": df["category_code"] = df["category"].cat.codes @@ -532,6 +544,8 @@ def prepare_categorical_plot_data_binned( trn_bin_col_cnts: pd.Series, syn_bin_col_cnts: pd.Series, sort_by_frequency: bool, + trn_col_total: int | None = None, + syn_col_total: int | None = None, ) -> pd.DataFrame: t = trn_bin_col_cnts.to_frame("target_cnt").reset_index(names="category") s = syn_bin_col_cnts.to_frame("synthetic_cnt").reset_index(names="category") @@ -540,8 +554,8 @@ def prepare_categorical_plot_data_binned( df["synthetic_cnt"] = df["synthetic_cnt"].fillna(0.0) df["avg_cnt"] = (df["target_cnt"] + df["synthetic_cnt"]) / 2 df = df[df["avg_cnt"] > 0] - df["target_pct"] = df["target_cnt"] / df["target_cnt"].sum() - df["synthetic_pct"] = df["synthetic_cnt"] / df["synthetic_cnt"].sum() + df["target_pct"] = df["target_cnt"] / (trn_col_total or df["target_cnt"].sum()) + df["synthetic_pct"] = df["synthetic_cnt"] / (syn_col_total or df["synthetic_cnt"].sum()) if df["category"].dtype.name == "category": df["category_code"] = df["category"].cat.codes else: @@ -554,10 +568,13 @@ def prepare_categorical_plot_data_binned( def plot_univariate_distribution_categorical( - trn_cat_col_cnts: pd.Series, syn_cat_col_cnts: pd.Series + trn_cat_col_cnts: pd.Series, + syn_cat_col_cnts: pd.Series, + trn_col_total: int | None = None, + syn_col_total: int | None = None, ) -> tuple[go.Scatter, go.Scatter]: # prepare data - df = prepare_categorical_plot_data_distribution(trn_cat_col_cnts, syn_cat_col_cnts) + df = prepare_categorical_plot_data_distribution(trn_cat_col_cnts, syn_cat_col_cnts, trn_col_total, syn_col_total) df = df.sort_values("avg_cnt", ascending=False) # trim labels df["category"] = trim_labels(df["category"], max_length=10) @@ -587,9 +604,13 @@ def plot_univariate_binned( trn_bin_col_cnts: pd.Series, syn_bin_col_cnts: pd.Series, sort_by_frequency: bool = False, + trn_col_total: int | None = None, + syn_col_total: int | None = None, ) -> tuple[go.Scatter, go.Scatter]: # prepare data - df = prepare_categorical_plot_data_binned(trn_bin_col_cnts, syn_bin_col_cnts, sort_by_frequency) + df = prepare_categorical_plot_data_binned( + trn_bin_col_cnts, syn_bin_col_cnts, sort_by_frequency, trn_col_total, syn_col_total + ) # prepare plots trn_line = go.Scatter( mode="lines+markers", diff --git a/mostlyai/qa/_coherence.py b/mostlyai/qa/_coherence.py index a9e91fd..9abc6a9 100644 --- a/mostlyai/qa/_coherence.py +++ b/mostlyai/qa/_coherence.py @@ -24,6 +24,7 @@ ) from mostlyai.qa._common import CHARTS_COLORS, CHARTS_FONTS from mostlyai.qa._filesystem import TemporaryWorkspace +from mostlyai.qa._sampling import harmonize_dtype def plot_store_coherences( @@ -215,3 +216,90 @@ def calculate_coh_univariates( ) accuracies["accuracy"], accuracies["accuracy_max"] = zip(*results) return accuracies + + +def pull_data_for_coherence( + *, + df_tgt: pd.DataFrame, + tgt_context_key: str, + max_sequence_length: int = 100, +) -> pd.DataFrame: + """ + Prepare sequential dataset for coherence metrics. + """ + # randomly sample at most max_sequence_length rows per sequence + df_tgt = df_tgt.sample(frac=1).reset_index(drop=True) + df_tgt = df_tgt[df_tgt.groupby(tgt_context_key).cumcount() < max_sequence_length].reset_index(drop=True) + + # harmonize dtypes + # apply harmonize_dtype to all columns except tgt_context_key + df_tgt = df_tgt.apply(lambda col: harmonize_dtype(col) if col.name != tgt_context_key else col) + + # TODO: discretize columns + for col in df_tgt.columns: + if col == tgt_context_key: + continue + df_tgt[col] = pd.Categorical(df_tgt[col], ordered=True) + + # Example output (pd.DataFrame): + # | players_id | year | team | league | G | AB | R | H | HR | RBI | SB | CS | BB | SO | + # |------------|--------|------|--------|------|-------|------|------|------|------|------|------|------|------| + # | borowha01 | 1943.0 | NYA | AL | 29.0 | 74.0 | 2.0 | 15.0 | 0.0 | 7.0 | 0.0 | 0.0 | 5.0 | 17.0 | + # | wallaja02 | 1946.0 | PHA | AL | 63.0 | 194.0 | 16.0 | 38.0 | 5.0 | 11.0 | 1.0 | 0.0 | 14.0 | 47.0 | + # players_id dtype: original, other columns dtype: category + return df_tgt + + +def calculate_categories_per_sequence(df: pd.DataFrame, context_key: str) -> pd.DataFrame: + """ + Calculate the number of categories per sequence for all columns except the context key. + """ + # Example output (pd.DataFrame): + # | players_id | year | team | league | G | AB | R | H | HR | RBI | SB | CS | BB | SO | + # |------------|------|------|--------|----|----|----|----|----|-----|----|----|----|----| + # | aardsda01 | 9 | 8 | 2 | 9 | 3 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | + # | aaronha01 | 23 | 3 | 2 | 18 | 21 | 20 | 23 | 17 | 20 | 15 | 10 | 22 | 19 | + # players_id dtype: original, other columns dtype: int64 + return df.groupby(context_key).nunique().reset_index() + + +def calculate_sequences_per_category( + df: pd.DataFrame, context_key: str +) -> tuple[dict[str, pd.Series], dict[str, pd.Series]]: + """ + Calculate the number of sequences per category for all columns except the context key. + """ + # replace all null values with '(n/a)' + df = df.copy() + for col in df.columns: + if col == context_key: + continue + # Add '(n/a)' category if needed and replace nulls + if df[col].isna().any(): + df[col] = df[col].cat.add_categories("(n/a)") + df.loc[df[col].isna(), col] = "(n/a)" + + # Example output for "team" (pd.Series): + # team + # ALT 18 + # ANA 164 + # Name: players_id, dtype: int64 + sequences_per_category_dict = { + col: df.groupby(col)[context_key].nunique().rename_axis(None) for col in df.columns if col != context_key + } + + # convert df to have top 9 categories w.r.t. frequency of belonging to sequences + '(other)' for all other categories + df = df.copy() + for col in df.columns: + if col == context_key: + continue + top_categories = sequences_per_category_dict[col].nlargest(9).index.tolist() + not_in_top_categories_mask = ~df[col].isin(top_categories) + if not_in_top_categories_mask.any(): + df[col] = df[col].cat.add_categories("(other)") + df.loc[not_in_top_categories_mask, col] = "(other)" + df[col] = df[col].cat.remove_unused_categories() + sequences_per_category_binned_dict = { + col: df.groupby(col)[context_key].nunique().rename_axis(None) for col in df.columns if col != context_key + } + return sequences_per_category_dict, sequences_per_category_binned_dict diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py index 198e376..ccd7389 100644 --- a/mostlyai/qa/_sampling.py +++ b/mostlyai/qa/_sampling.py @@ -169,21 +169,6 @@ def sample_two_consecutive_rows( return first_rows, second_rows -def pull_data_for_coherence( - *, - df_tgt: pd.DataFrame, - tgt_context_key: str, - max_sequence_length: int = 100, -) -> pd.DataFrame: - """ - Prepare sequential dataset for coherence metrics. - """ - # randomly sample at most max_sequence_length rows per sequence - df_tgt = df_tgt.sample(frac=1).reset_index(drop=True) - df_tgt = df_tgt[df_tgt.groupby(tgt_context_key).cumcount() < max_sequence_length].reset_index(drop=True) - return df_tgt - - def pull_data_for_embeddings( *, df_tgt: pd.DataFrame, diff --git a/mostlyai/qa/reporting.py b/mostlyai/qa/reporting.py index 6defcdd..298fa2e 100644 --- a/mostlyai/qa/reporting.py +++ b/mostlyai/qa/reporting.py @@ -36,12 +36,11 @@ plot_store_univariates, plot_store_bivariates, ) -from mostlyai.qa._coherence import calculate_coh_univariates, plot_store_coherences +from mostlyai.qa._coherence import calculate_coh_univariates, plot_store_coherences, pull_data_for_coherence from mostlyai.qa.metrics import ModelMetrics, Accuracy, Similarity, Distances from mostlyai.qa._sampling import ( calculate_embeddings, pull_data_for_accuracy, - pull_data_for_coherence, pull_data_for_embeddings, ) from mostlyai.qa._common import (