diff --git a/association_measures/measures.py b/association_measures/measures.py index 82dff78..5beff2d 100644 --- a/association_measures/measures.py +++ b/association_measures/measures.py @@ -4,7 +4,7 @@ """ import numpy as np -from pandas import concat +from pandas import concat, merge from scipy.stats import norm, beta from warnings import warn @@ -95,14 +95,22 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None, else: measures = [ams_all[k] for k in ams_all] + # reduce df to unique frequency signatures + vocab = len(df) if vocab is None else vocab + df_reduced = df.drop_duplicates(subset=list(freq_columns)).copy() + # calculate measures for measure in measures: - df[measure.__name__] = measure( - df, disc=disc, discounting=discounting, signed=signed, alpha=alpha, + df_reduced[measure.__name__] = measure( + df_reduced, disc=disc, discounting=discounting, signed=signed, alpha=alpha, correct=correct, boundary=boundary, vocab=vocab, one_sided=one_sided ) - # frequency columns? + # join on frequency columns (NB: thanks to pandas API, we have to take care of index names ourselves) + index_names = ['index'] if df.index.names == [None] else df.index.names + df = merge(df.reset_index(), df_reduced, how='left', on=list(freq_columns)).set_index(index_names) + + # keep frequency columns? if not freq: df = df.drop(freq_columns, axis=1) else: diff --git a/performance.md b/performance.md index a2a34e0..83b529d 100644 --- a/performance.md +++ b/performance.md @@ -1,27 +1,28 @@ # Performance - performance is calculated on a Lenovo X1 Carbon (10th generation, i7) - input data are 24,167 observations from [brown.csv](tests/data/brown.csv) -- we report for 1000 iterations +- NB: dataframe contains 4241 duplicated frequency signatures (for which calculation will only be run once since v0.2.7) +- for each measure, we report time needed for 1000 scorings of the whole dataframe ## v0.2.7 - major performance improvement regarding conservative log-ratio with Poisson boundary (factor 50) ``` settings: iterations=1000, df_size=24167 -- 0.0874 :: contingency_table -- 1.5254 :: expected_frequencies -- 0.1510 :: z_score -- 0.2906 :: t_score -- 1.7408 :: log_likelihood -- 0.6146 :: simple_ll -- 1.3270 :: min_sensitivity -- 0.2604 :: liddell -- 0.2502 :: dice -- 0.4494 :: log_ratio -- 4.6467 :: binomial_likelihood -- 2.1923 :: conservative_log_ratio -- 31.2882 :: conservative_log_ratio_poisson -- 0.3840 :: mutual_information -- 0.4441 :: local_mutual_information +- 0.0871 :: contingency_table +- 1.5258 :: expected_frequencies +- 0.1507 :: z_score +- 0.2899 :: t_score +- 1.7406 :: log_likelihood +- 0.6125 :: simple_ll +- 1.2981 :: min_sensitivity +- 0.2584 :: liddell +- 0.2491 :: dice +- 0.4460 :: log_ratio +- 4.5788 :: binomial_likelihood +- 2.1891 :: conservative_log_ratio +- 29.8616 :: conservative_log_ratio_poisson +- 0.3702 :: mutual_information +- 0.4314 :: local_mutual_information ``` ## v0.2.6