fix compatibility with pandas 2.0
mplatzer authored Dec 4, 2024
1 parent 8b3c5ce commit de46272
Showing 7 changed files with 81 additions and 38 deletions.
README.md (6 changes: 6 additions & 0 deletions)
@@ -20,6 +20,12 @@ The latest release of `mostlyai-qa` can be installed via pip:
pip install -U mostlyai-qa
```

The latest development version can be installed directly from GitHub:

```bash
pip install -U git+https://github.com/mostly-ai/mostlyai-qa.git@main
```

## Quick Start

```python
mostlyai/qa/__init__.py (4 changes: 3 additions & 1 deletion)
@@ -15,6 +15,7 @@
import os

import pandas as pd
+from packaging.version import Version

from mostlyai.qa.report import report
from mostlyai.qa.report_from_statistics import report_from_statistics
@@ -23,4 +24,5 @@
__version__ = "1.3.0"

os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("future.no_silent_downcasting", True)
if Version(pd.__version__) >= Version("2.2.0"):
pd.set_option("future.no_silent_downcasting", True)
mostlyai/qa/report.py (60 changes: 37 additions & 23 deletions)
@@ -233,40 +233,54 @@ def report(
        hol_sample_size or float("inf"),
    )

-    if max_sample_size_embeddings_final >= 10_000 and max_sample_size_embeddings is None:
+    if max_sample_size_embeddings_final > 10_000 and max_sample_size_embeddings is None:
        warnings.warn(
            UserWarning(
                "More than 10k embeddings will be calculated per dataset. "
                "Consider setting a limit via `max_sample_size_embeddings`."
            )
        )

-    def _calc_pull_embeds(
-        df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, progress_from: int, progress_to: int
-    ) -> np.ndarray:
-        strings = pull_data_for_embeddings(
-            df_tgt=df_tgt,
-            df_ctx=df_ctx,
+    _LOG.info("calculate embeddings for synthetic")
+    syn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=syn_tgt_data,
+            df_ctx=syn_ctx_data,
            ctx_primary_key=ctx_primary_key,
            tgt_context_key=tgt_context_key,
            max_sample_size=max_sample_size_embeddings_final,
-        )
-        # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
-        buckets = np.array_split(strings, progress_to - progress_from)
-        buckets = [b for b in buckets if len(b) > 0]
-        embeds = []
-        for i, bucket in enumerate(buckets, 1):
-            embeds += [calculate_embeddings(bucket.tolist())]
-            progress.update(completed=progress_from + i, total=100)
-        progress.update(completed=progress_to, total=100)
-        embeds = np.concatenate(embeds, axis=0)
-        _LOG.info(f"calculated embeddings {embeds.shape}")
-        return embeds
-
-    syn_embeds = _calc_pull_embeds(df_tgt=syn_tgt_data, df_ctx=syn_ctx_data, progress_from=20, progress_to=40)
-    trn_embeds = _calc_pull_embeds(df_tgt=trn_tgt_data, df_ctx=trn_ctx_data, progress_from=40, progress_to=60)
+        ),
+        progress=progress,
+        progress_from=20,
+        progress_to=40,
+    )
+    _LOG.info("calculate embeddings for training")
+    trn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=trn_tgt_data,
+            df_ctx=trn_ctx_data,
+            ctx_primary_key=ctx_primary_key,
+            tgt_context_key=tgt_context_key,
+            max_sample_size=max_sample_size_embeddings_final,
+        ),
+        progress=progress,
+        progress_from=40,
+        progress_to=60,
+    )
    if hol_tgt_data is not None:
-        hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, progress_from=60, progress_to=80)
+        _LOG.info("calculate embeddings for holdout")
+        hol_embeds = calculate_embeddings(
+            strings=pull_data_for_embeddings(
+                df_tgt=hol_tgt_data,
+                df_ctx=hol_ctx_data,
+                ctx_primary_key=ctx_primary_key,
+                tgt_context_key=tgt_context_key,
+                max_sample_size=max_sample_size_embeddings_final,
+            ),
+            progress=progress,
+            progress_from=60,
+            progress_to=80,
+        )
    else:
        hol_embeds = None
    progress.update(completed=80, total=100)
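
The inline calls above hand the bucketing and progress reporting over to `calculate_embeddings` (see `sampling.py` below), giving each dataset a 20-point progress window. A toy illustration of how such a window maps to buckets, using made-up data and a print in place of the real progress callback:

```python
import numpy as np

strings = [f"record {i}" for i in range(95)]  # made-up records
progress_from, progress_to = 20, 40  # the window used for the synthetic dataset

buckets = [b for b in np.array_split(strings, progress_to - progress_from) if len(b) > 0]
for i, bucket in enumerate(buckets, 1):
    # each encoded bucket advances the shared progress bar by one step
    print(f"bucket {i:2d}: {len(bucket)} strings -> progress.update(completed={progress_from + i}, total=100)")
```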
mostlyai/qa/report_from_statistics.py (7 changes: 5 additions & 2 deletions)
@@ -107,13 +107,16 @@ def report_from_statistics(

_LOG.info("calculate embeddings for synthetic")
syn_embeds = calculate_embeddings(
pull_data_for_embeddings(
strings=pull_data_for_embeddings(
df_tgt=syn_tgt_data,
df_ctx=syn_ctx_data,
ctx_primary_key=ctx_primary_key,
tgt_context_key=tgt_context_key,
max_sample_size=max_sample_size_embeddings,
)
),
progress=progress,
progress_from=30,
progress_to=50,
)

_LOG.info("report similarity")
mostlyai/qa/sampling.py (33 changes: 26 additions & 7 deletions)
@@ -40,6 +40,7 @@
NXT_COLUMN_PREFIX,
COUNT_COLUMN,
ACCURACY_MAX_COLUMNS,
+ProgressCallbackWrapper,
)
from mostlyai.qa.assets import load_embedder, load_tokenizer

@@ -221,8 +222,9 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
return ", ".join(sequence.apply(row_to_string, axis=1))

    strings = (
-        df_tgt.groupby(tgt_context_key)
-        .apply(sequence_to_string, include_groups=False)
+        df_tgt.set_index(tgt_context_key)
+        .groupby(tgt_context_key)
+        .apply(sequence_to_string)
        .sample(frac=1)
        .reset_index(drop=True)
    )
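
The `include_groups=False` keyword used before only exists in pandas 2.2 and later; setting the context key as the index before grouping keeps the key column out of the frames passed to `apply` on older pandas 2.x releases as well. A minimal sketch with a toy frame; the column names are illustrative, not from the repository:

```python
import pandas as pd

df_tgt = pd.DataFrame({"ctx_key": [1, 1, 2], "value": ["a", "b", "c"]})

strings = (
    df_tgt.set_index("ctx_key")
    .groupby("ctx_key")
    .apply(lambda g: ", ".join(g["value"]))  # g no longer contains the grouping column
    .sample(frac=1)
    .reset_index(drop=True)
)
print(strings.to_list())  # e.g. ["a, b", "c"] in shuffled order
```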
@@ -233,13 +235,30 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
return strings.to_list()


-def calculate_embeddings(strings: list[str]) -> np.ndarray:
+def calculate_embeddings(
+    strings: list[str],
+    progress: ProgressCallbackWrapper | None = None,
+    progress_from: int | None = None,
+    progress_to: int | None = None,
+) -> np.ndarray:
    t0 = time.time()
+    # load embedder
    embedder = load_embedder(device="cuda" if torch.cuda.is_available() else "cpu")
-    embeddings = embedder.encode(strings)
-    time_elapsed = time.time() - t0
-    _LOG.info(f"created embeddings for {len(strings):,} records ({time_elapsed=:.2f}s)")
-    return embeddings
+    # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
+    steps = progress_to - progress_from if progress_to is not None and progress_from is not None else 1
+    buckets = np.array_split(strings, steps)
+    buckets = [b for b in buckets if len(b) > 0]
+    # calculate embeddings for each bucket
+    embeds = []
+    for i, bucket in enumerate(buckets, 1):
+        embeds += [embedder.encode(bucket.tolist(), show_progress_bar=False)]
+        if progress is not None:
+            progress.update(completed=progress_from + i, total=100)
+    if progress is not None:
+        progress.update(completed=progress_to, total=100)
+    embeds = np.concatenate(embeds, axis=0)
+    _LOG.info(f"calculated embeddings {embeds.shape} in {time.time() - t0:.2f}s")
+    return embeds


def sample_text_tokens(df: pd.DataFrame) -> pd.DataFrame:
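
A hypothetical call of the reworked helper (assumes the package and its bundled embedder are available locally): without a progress wrapper, `steps` falls back to 1 and all strings are encoded as a single bucket.

```python
from mostlyai.qa.sampling import calculate_embeddings

embeds = calculate_embeddings(["apple pie", "car engine repair"])
print(embeds.shape)  # (2, <embedding dimension of the loaded embedder>)
```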
mostlyai/qa/similarity.py (2 changes: 1 addition & 1 deletion)
@@ -96,7 +96,7 @@ def calculate_mean_auc(embeds1, embeds2):

# calculate the AUC score
auc_score = roc_auc_score(y_holdout, y_holdout_pred)
-auc_scores.append(auc_score)
+auc_scores.append(round(auc_score, 4))

_LOG.info(f"{auc_scores=}")

tests/unit/test_similarity.py (7 changes: 3 additions & 4 deletions)
@@ -13,18 +13,17 @@
# limitations under the License.

import numpy as np
-import pandas as pd

from mostlyai.qa.similarity import calculate_cosine_similarities, calculate_discriminator_auc
from mostlyai.qa.sampling import calculate_embeddings


def test_calculate_embeddings():
-    trn = pd.Series(["apple recipe", "car engine repair", "apple recipe"])
+    trn = ["apple recipe", "car engine repair", "apple recipe"]
    # semantically close synthetic data
-    syn_close = pd.Series(["apple pie", "car maintenance"])
+    syn_close = ["apple pie", "car maintenance"]
    # semantically distant synthetic data
-    syn_distant = pd.Series(["quantum physics theory", "deep space exploration"])
+    syn_distant = ["quantum physics theory", "deep space exploration"]

trn_embeds = calculate_embeddings(trn)
syn_close_embeds = calculate_embeddings(syn_close)
