Merge pull request #33 from fau-klue/v0.2.7

v0.2.7

ausgerechnet authored Nov 8, 2023
2 parents 4de5c57 + fab2a2c commit da5d030
Showing 11 changed files with 125 additions and 802 deletions.
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 160
2 changes: 1 addition & 1 deletion .github/workflows/python-build.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.7, 3.8, 3.9 ]
python-version: [ 3.7, 3.8, 3.9, "3.10" ]

steps:
- uses: actions/checkout@v2
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
# Standard Python ignores
# -----------------------

Pipfile.lock

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
728 changes: 0 additions & 728 deletions Pipfile.lock

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -136,7 +136,7 @@ The following association measures are currently implemented (v0.2.2):
- **Liddell** (`liddell`)
- **minimum sensitivity** (`min_sensitivity`)
- [**log-ratio**](http://cass.lancs.ac.uk/log-ratio-an-informal-introduction/) (`log_ratio`)
- parameter: `disc`
- parameters: `disc`, `discounting`
- **Dice coefficient** (`dice`)
- information theory:
- **mutual information** (`mutual_information`)
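The new `discounting` parameter sits alongside `disc`. A minimal usage sketch (hedged: the import path is an assumption about the package layout; the input columns follow the `log_ratio` docstring below):

```
# hedged sketch: log-ratio with the two discounting strategies of v0.2.7
import pandas as pd
import association_measures.measures as am

df = pd.DataFrame({
    'O11': [10, 0],     # one zero count to trigger the discounting
    'O21': [5, 3],
    'R1': [100, 100],
    'R2': [900, 900],
})

lr_walter = am.log_ratio(df, disc=.5, discounting='Walter1975')  # new default
lr_hardie = am.log_ratio(df, disc=.5, discounting='Hardie2014')
```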
81 changes: 41 additions & 40 deletions association_measures/measures.py
@@ -4,7 +4,7 @@
"""

import numpy as np
from pandas import concat
from pandas import concat, merge
from scipy.stats import norm, beta
from warnings import warn

@@ -47,8 +47,9 @@ def list_measures():

def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
freq=True, per_million=True, digits=6, disc=.001,
signed=True, alpha=.001, correct='Bonferroni',
boundary='normal', vocab=None, one_sided=False):
discounting='Walter1975', signed=True, alpha=.001,
correct='Bonferroni', boundary='poisson', vocab=None,
one_sided=False):
"""Calculate a list of association measures on columns of df. Defaults
to all available (and numerically stable) measures.
@@ -67,6 +68,7 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
Further keyword arguments will be passed to the respective measures:
:param float disc: discounting (or smoothing) parameter for O11 == 0 (and O21 == 0)
:param str discounting: LR: discounting strategy (Walter1975 vs. Hardie2014)
:param bool signed: enforce negative values for rows with O11 < E11?
:param float alpha: CLR: significance level
:param str boundary: CLR: exact CI boundary of [poisson] distribution or [normal] approximation?
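A hedged sketch of a full `score()` call with the new keywords (the frequency notation `f`/`f1`/`f2`/`N`, selection of measures by name, and the import path are assumptions not confirmed by this diff):

```
# hedged sketch: score() with the v0.2.7 defaults made explicit;
# input columns and import path are assumptions
import pandas as pd
import association_measures.measures as am

df = pd.DataFrame({
    'f': [10, 3],        # assumed frequency notation
    'f1': [100, 100],
    'f2': [15, 11],
    'N': [1000, 1000],
})

scores = am.score(df, measures=['log_ratio', 'conservative_log_ratio'],
                  discounting='Walter1975', boundary='poisson')
```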
@@ -93,14 +95,22 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
else:
measures = [ams_all[k] for k in ams_all]

# reduce df to unique frequency signatures
vocab = len(df) if vocab is None else vocab
df_reduced = df.drop_duplicates(subset=list(freq_columns)).copy()

# calculate measures
for measure in measures:
df[measure.__name__] = measure(
df, disc=disc, signed=signed, alpha=alpha,
df_reduced[measure.__name__] = measure(
df_reduced, disc=disc, discounting=discounting, signed=signed, alpha=alpha,
correct=correct, boundary=boundary, vocab=vocab, one_sided=one_sided
)

# frequency columns?
# join on frequency columns (NB: thanks to pandas API, we have to take care of index names ourselves)
index_names = ['index'] if df.index.names == [None] else df.index.names
df = merge(df.reset_index(), df_reduced, how='left', on=list(freq_columns)).set_index(index_names)

# keep frequency columns?
if not freq:
df = df.drop(freq_columns, axis=1)
else:
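The hunk above is the heart of the v0.2.7 speedup: each distinct frequency signature is scored once, and the scores are then merged back onto all rows. A self-contained sketch of the pattern (the column list is a stand-in for the package's `freq_columns`, which is defined outside this diff):

```
# minimal sketch of the score-once-per-signature pattern
import pandas as pd

freq_columns = ['O11', 'O21', 'R1', 'R2']  # stand-in for the real freq_columns

df = pd.DataFrame({
    'O11': [10, 10, 3],     # rows 0 and 1 share a frequency signature
    'O21': [5, 5, 8],
    'R1': [100, 100, 100],
    'R2': [900, 900, 900],
})

# score each unique signature once ...
df_reduced = df.drop_duplicates(subset=freq_columns).copy()
df_reduced['score'] = df_reduced['O11'] / df_reduced['O21']  # stand-in measure

# ... then broadcast the scores back to all rows, restoring the index
index_names = ['index'] if df.index.names == [None] else list(df.index.names)
df = pd.merge(df.reset_index(), df_reduced, how='left',
              on=freq_columns).set_index(index_names)
print(df['score'])  # rows 0 and 1 receive identical scores
```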
@@ -263,20 +273,25 @@ def dice(df, **kwargs):
return am


def log_ratio(df, disc=.5, **kwargs):
def log_ratio(df, disc=.5, discounting='Walter1975', **kwargs):
"""Calculate log-ratio, i.e. binary logarithm of relative risk
:param DataFrame df: pd.DataFrame with columns O11, O21, R1, R2
:param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
:param str discounting: discounting according to Walter1975 or Hardie2014?
:return: log-ratio
:rtype: pd.Series
"""

# questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)
if discounting == 'Walter1975':
# mathematically sensible discounting according to Walter (1975)
am = np.log2(((df['O11'] + disc) / (df['R1'] + disc)) / ((df['O21'] + disc) / (df['R2'] + disc)))

am = np.log2((O11_disc / O21_disc) / (df['R1'] / df['R2']))
elif discounting == 'Hardie2014':
# questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)
am = np.log2((O11_disc / O21_disc) / (df['R1'] / df['R2']))

return am
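In formula terms, with discounting parameter $d$ (= `disc`), the two branches above compute:

```
% the two discounting variants of log-ratio (d = disc)
\mathrm{LR}_{\mathrm{Walter1975}}
  = \log_2 \frac{(O_{11} + d) / (R_1 + d)}{(O_{21} + d) / (R_2 + d)}
\qquad
\mathrm{LR}_{\mathrm{Hardie2014}}
  = \log_2 \frac{O_{11}^{*} / O_{21}^{*}}{R_1 / R_2},
\quad
O^{*} = \begin{cases} O & \text{if } O \neq 0 \\ d & \text{otherwise} \end{cases}
```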

@@ -335,34 +350,6 @@ def binomial_likelihood(df, **kwargs):
# CONSERVATIVE ESTIMATES #
##########################

def get_poisson_ci_boundary(alpha, O11, R1, O21, R2):
"""
Get the lower (if O11 / R1 >= O21 / R2) or upper (else) bound of
the CI of a Poisson distribution
:param float alpha: sig. level
:param int O11:
:param int R1:
:param int O21:
:param int R2:
"""

if O11 == O21 == 0:
return 0

if (O11 / R1) >= (O21 / R2):
lower = beta.ppf(alpha, O11, O21 + 1)
boundary = max(np.log2((R2 / R1) * lower / (1 - lower)), 0)
else:
upper = beta.ppf(1 - alpha, O11 + 1, O21)
boundary = min(np.log2((R2 / R1) * upper / (1 - upper)), 0)

return boundary


BOUNDARY = np.vectorize(get_poisson_ci_boundary, otypes=[float])


def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
correct='Bonferroni', vocab=None,
one_sided=False, **kwargs):
@@ -407,7 +394,21 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

# Poisson approximation (Evert 2022)
if boundary == 'poisson':
clrr = BOUNDARY(alpha, df['O11'], df['R1'], df['O21'], df['R2'])

# lower bound of the confidence interval (used where O11 / R1 >= O21 / R2)
lower = beta.ppf(alpha, df['O11'], df['O21'] + 1)
lower_boundary = np.log2((df['R2'] / df['R1']) * lower / (1 - lower)).clip(lower=0)

# upper bound of the confidence interval (used otherwise)
upper = beta.ppf(1 - alpha, df['O11'] + 1, df['O21'])
upper_boundary = np.log2((df['R2'] / df['R1']) * upper / (1 - upper)).clip(upper=0)

# combine, set to 0 where O11 == O21 == 0
clrr = lower_boundary.where(
(df['O11'] / df['R1']) >= (df['O21'] / df['R2']),
upper_boundary
)
clrr = clrr.where(~((df['O11'] == 0) & (df['O21'] == 0)), 0).fillna(0)

# Normal approximation (Hardie 2014)
elif boundary == 'normal':
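The deleted `np.vectorize` wrapper called `get_poisson_ci_boundary` once per row in Python; the replacement relies on `scipy.stats.beta.ppf` broadcasting over whole columns, which is where the roughly 50-fold speedup reported in performance.md comes from. A minimal illustration with toy arrays:

```
# sketch: beta.ppf broadcasts over arrays, so no per-row Python calls are needed
import numpy as np
from scipy.stats import beta

alpha = 0.001
O11 = np.array([10, 0, 5])
O21 = np.array([2, 3, 0])

# one vectorised call for all rows; an invalid shape parameter (a == 0)
# yields NaN, which the code above neutralises via .fillna(0)
lower = beta.ppf(alpha, O11, O21 + 1)
print(lower)
```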
2 changes: 1 addition & 1 deletion association_measures/version.py
@@ -2,5 +2,5 @@
Association measures are mathematical formulae that interpret cooccurrence frequency data.
"""

VERSION = (0, 2, 6)
VERSION = (0, 2, 7)
__version__ = '.'.join(map(str, VERSION))
47 changes: 47 additions & 0 deletions performance.md
@@ -0,0 +1,47 @@
# Performance
- performance is measured on a Lenovo X1 Carbon (10th generation, i7)
- input data are 24,167 observations from [brown.csv](tests/data/brown.csv)
- NB: the dataframe contains 4,241 duplicated frequency signatures, which are only scored once since v0.2.7
- for each measure, we report time needed for 1000 scorings of the whole dataframe
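The timings below come from a `timeit` harness along the lines of the following sketch (the real loop lives in performance.py further down; the `read_csv` call and the layout of brown.csv are assumptions):

```
# hedged sketch of one timing line; see performance.py for the actual harness
import timeit

iterations = 1000
setup = (
    "import pandas as pd\n"
    "import association_measures.measures as am\n"
    "df = pd.read_csv('tests/data/brown.csv', index_col=0)\n"
)
res = timeit.timeit(setup=setup, stmt='am.log_ratio(df)', number=iterations)
print(f'- {res:7.4f} :: log_ratio')
```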

## v0.2.7
- major performance improvement for conservative log-ratio with Poisson boundary (roughly a factor of 50)
```
settings: iterations=1000, df_size=24167
- 0.0871 :: contingency_table
- 1.5258 :: expected_frequencies
- 0.1507 :: z_score
- 0.2899 :: t_score
- 1.7406 :: log_likelihood
- 0.6125 :: simple_ll
- 1.2981 :: min_sensitivity
- 0.2584 :: liddell
- 0.2491 :: dice
- 0.4460 :: log_ratio
- 4.5788 :: binomial_likelihood
- 2.1891 :: conservative_log_ratio
- 29.8616 :: conservative_log_ratio_poisson
- 0.3702 :: mutual_information
- 0.4314 :: local_mutual_information
```

## v0.2.6
```
Calculate contingency_table (iterations=1000, df_size=24168): 0.0873531000688672
Calculate expected_frequencies (iterations=1000, df_size=24168): 1.5203204130521044
Calculate z_score (iterations=1000, df_size=24168): 0.14853612298611552
Calculate t_score (iterations=1000, df_size=24168): 0.2881241790018976
Calculate log_likelihood (iterations=1000, df_size=24168): 1.7284309939714149
Calculate simple_ll (iterations=1000, df_size=24168): 0.6111006899736822
Calculate min_sensitivity (iterations=1000, df_size=24168): 1.3227944150567055
Calculate liddell (iterations=1000, df_size=24168): 0.25499376200605184
Calculate dice (iterations=1000, df_size=24168): 0.2465739679755643
Calculate log_ratio (iterations=1000, df_size=24168): 0.751795751042664
Calculate binomial_likelihood (iterations=1000, df_size=24168): 4.606213430990465
Calculate conservative_log_ratio (iterations=1000, df_size=24168): 2.2395021530101076
Calculate mutual_information (iterations=1000, df_size=24168): 0.3618475969415158
Calculate local_mutual_information (iterations=1000, df_size=24168): 0.41407940594945103
```
additionally:
- conservative log ratio with Poisson boundary: ~1.5s for 1 iteration
- hypergeometric likelihood: ~2.5s for 1 iteration
18 changes: 9 additions & 9 deletions performance.py
100755 → 100644
@@ -70,7 +70,7 @@
'code': 'am.log_ratio(df)'
},
# likelihood measures
# ~2.5s for a ~25,000 rows on 8 threads
# ~2.5s for 1x ~25,000
# {
# 'name': 'hypergeometric_likelihood',
# 'code': 'am.hypergeometric_likelihood(df)'
@@ -84,11 +84,10 @@
'name': 'conservative_log_ratio',
'code': 'am.conservative_log_ratio(df)'
},
# ~1.5s for a ~25,000 rows on 8 threads
# {
# 'name': 'conservative_log_ratio_poisson',
# 'code': 'am.conservative_log_ratio(df, boundary="poisson")'
# },
{
'name': 'conservative_log_ratio_poisson',
'code': 'am.conservative_log_ratio(df, boundary="poisson")'
},
# information theory
{
'name': 'mutual_information',
@@ -100,8 +99,9 @@
},
]

size = 24167
print(f'settings: iterations={iterations}, df_size={size}')
for code in codes:
res = timeit.timeit(setup=setup, stmt=code['code'], number=iterations)
print('Calculate {func} (iterations={iter}, df_size={size}): {res}'.format(
iter=iterations, size=24168, res=res, func=code['name']
))
func = code['name']
print(f'- {res:7.4f} :: {func}')
4 changes: 1 addition & 3 deletions setup.py
@@ -1,7 +1,5 @@
#!/usr/bin/env python3


import io
import os
import sys
from setuptools import find_packages, Command
@@ -101,10 +99,10 @@ def run(self):
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Cython',
],
)