Merge pull request #33 from fau-klue/v0.2.7

v0.2.7

ausgerechnet authored Nov 8, 2023
2 parents 4de5c57 + fab2a2c commit da5d030
Showing 11 changed files with 125 additions and 802 deletions.
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 160
2 changes: 1 addition & 1 deletion .github/workflows/python-build.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.7, 3.8, 3.9 ]
python-version: [ 3.7, 3.8, 3.9, "3.10" ]

steps:
- uses: actions/checkout@v2
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
# Standard Python ignores
# -----------------------

Pipfile.lock

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
728 changes: 0 additions & 728 deletions Pipfile.lock

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -136,7 +136,7 @@ The following association measures are currently implemented (v0.2.2):
- **Liddell** (`liddell`)
- **minimum sensitivity** (`min_sensitivity`)
- [**log-ratio**](http://cass.lancs.ac.uk/log-ratio-an-informal-introduction/) (`log_ratio`)
- parameter: `disc`
- parameters: `disc`, `discounting`
- **Dice coefficient** (`dice`)
- information theory:
- **mutual information** (`mutual_information`)
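The new `discounting` parameter sits alongside `disc`. A minimal usage sketch (hedged: the import path is an assumption about the package layout; the input columns follow the `log_ratio` docstring below):

```
# hedged sketch: log-ratio with the two discounting strategies of v0.2.7
import pandas as pd
import association_measures.measures as am

df = pd.DataFrame({
    'O11': [10, 0],     # one zero count to trigger the discounting
    'O21': [5, 3],
    'R1': [100, 100],
    'R2': [900, 900],
})

lr_walter = am.log_ratio(df, disc=.5, discounting='Walter1975')  # new default
lr_hardie = am.log_ratio(df, disc=.5, discounting='Hardie2014')
```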
81 changes: 41 additions & 40 deletions association_measures/measures.py
@@ -4,7 +4,7 @@
"""

import numpy as np
from pandas import concat
from pandas import concat, merge
from scipy.stats import norm, beta
from warnings import warn

@@ -47,8 +47,9 @@ def list_measures():

def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
freq=True, per_million=True, digits=6, disc=.001,
signed=True, alpha=.001, correct='Bonferroni',
boundary='normal', vocab=None, one_sided=False):
discounting='Walter1975', signed=True, alpha=.001,
correct='Bonferroni', boundary='poisson', vocab=None,
one_sided=False):
"""Calculate a list of association measures on columns of df. Defaults
to all available (and numerically stable) measures.
@@ -67,6 +68,7 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
Further keyword arguments will be passed to the respective measures:
:param float disc: discounting (or smoothing) parameter for O11 == 0 (and O21 == 0)
:param str discounting: LR: discounting strategy (Walter1975 vs. Hardie2014)
:param bool signed: enforce negative values for rows with O11 < E11?
:param float alpha: CLR: significance level
:param str boundary: CLR: exact CI boundary of [poisson] distribution or [normal] approximation?
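A hedged sketch of a full `score()` call with the new keywords (the frequency notation `f`/`f1`/`f2`/`N`, selection of measures by name, and the import path are assumptions not confirmed by this diff):

```
# hedged sketch: score() with the v0.2.7 defaults made explicit;
# input columns and import path are assumptions
import pandas as pd
import association_measures.measures as am

df = pd.DataFrame({
    'f': [10, 3],        # assumed frequency notation
    'f1': [100, 100],
    'f2': [15, 11],
    'N': [1000, 1000],
})

scores = am.score(df, measures=['log_ratio', 'conservative_log_ratio'],
                  discounting='Walter1975', boundary='poisson')
```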
@@ -93,14 +95,22 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
else:
measures = [ams_all[k] for k in ams_all]

# reduce df to unique frequency signatures
vocab = len(df) if vocab is None else vocab
df_reduced = df.drop_duplicates(subset=list(freq_columns)).copy()

# calculate measures
for measure in measures:
df[measure.__name__] = measure(
df, disc=disc, signed=signed, alpha=alpha,
df_reduced[measure.__name__] = measure(
df_reduced, disc=disc, discounting=discounting, signed=signed, alpha=alpha,
correct=correct, boundary=boundary, vocab=vocab, one_sided=one_sided
)

# frequency columns?
# join on frequency columns (NB: thanks to pandas API, we have to take care of index names ourselves)
index_names = ['index'] if df.index.names == [None] else df.index.names
df = merge(df.reset_index(), df_reduced, how='left', on=list(freq_columns)).set_index(index_names)

# keep frequency columns?
if not freq:
df = df.drop(freq_columns, axis=1)
else:
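The hunk above is the heart of the v0.2.7 speedup: each distinct frequency signature is scored once, and the scores are then merged back onto all rows. A self-contained sketch of the pattern (the column list is a stand-in for the package's `freq_columns`, which is defined outside this diff):

```
# minimal sketch of the score-once-per-signature pattern
import pandas as pd

freq_columns = ['O11', 'O21', 'R1', 'R2']  # stand-in for the real freq_columns

df = pd.DataFrame({
    'O11': [10, 10, 3],     # rows 0 and 1 share a frequency signature
    'O21': [5, 5, 8],
    'R1': [100, 100, 100],
    'R2': [900, 900, 900],
})

# score each unique signature once ...
df_reduced = df.drop_duplicates(subset=freq_columns).copy()
df_reduced['score'] = df_reduced['O11'] / df_reduced['O21']  # stand-in measure

# ... then broadcast the scores back to all rows, restoring the index
index_names = ['index'] if df.index.names == [None] else list(df.index.names)
df = pd.merge(df.reset_index(), df_reduced, how='left',
              on=freq_columns).set_index(index_names)
print(df['score'])  # rows 0 and 1 receive identical scores
```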
@@ -263,20 +273,25 @@ def dice(df, **kwargs):
return am


def log_ratio(df, disc=.5, **kwargs):
def log_ratio(df, disc=.5, discounting='Walter1975', **kwargs):
"""Calculate log-ratio, i.e. binary logarithm of relative risk
:param DataFrame df: pd.DataFrame with columns O11, O21, R1, R2
:param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
:param str discounting: discounting according to Walter1975 or Hardie2014?
:return: log-ratio
:rtype: pd.Series
"""

# questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)
if discounting == 'Walter1975':
# mathematically sensible discounting according to Walter (1975)
am = np.log2(((df['O11'] + disc) / (df['R1'] + disc)) / ((df['O21'] + disc) / (df['R2'] + disc)))

am = np.log2((O11_disc / O21_disc) / (df['R1'] / df['R2']))
elif discounting == 'Hardie2014':
# questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)
am = np.log2((O11_disc / O21_disc) / (df['R1'] / df['R2']))

return am
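In formula terms, with discounting parameter $d$ (= `disc`), the two branches above compute:

```
% the two discounting variants of log-ratio (d = disc)
\mathrm{LR}_{\mathrm{Walter1975}}
  = \log_2 \frac{(O_{11} + d) / (R_1 + d)}{(O_{21} + d) / (R_2 + d)}
\qquad
\mathrm{LR}_{\mathrm{Hardie2014}}
  = \log_2 \frac{O_{11}^{*} / O_{21}^{*}}{R_1 / R_2},
\quad
O^{*} = \begin{cases} O & \text{if } O \neq 0 \\ d & \text{otherwise} \end{cases}
```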

@@ -335,34 +350,6 @@ def binomial_likelihood(df, **kwargs):
# CONSERVATIVE ESTIMATES #
##########################

def get_poisson_ci_boundary(alpha, O11, R1, O21, R2):
"""
Get the lower (if O11 / R1 >= O21 / R2) or upper (else) bound of
the CI of a Poisson distribution
:param float alpha: sig. level
:param int O11:
:param int R1:
:param int O21:
:param int R2:
"""

if O11 == O21 == 0:
return 0

if (O11 / R1) >= (O21 / R2):
lower = beta.ppf(alpha, O11, O21 + 1)
boundary = max(np.log2((R2 / R1) * lower / (1 - lower)), 0)
else:
upper = beta.ppf(1 - alpha, O11 + 1, O21)
boundary = min(np.log2((R2 / R1) * upper / (1 - upper)), 0)

return boundary


BOUNDARY = np.vectorize(get_poisson_ci_boundary, otypes=[float])


def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
correct='Bonferroni', vocab=None,
one_sided=False, **kwargs):
@@ -407,7 +394,21 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

# Poisson approximation (Evert 2022)
if boundary == 'poisson':
clrr = BOUNDARY(alpha, df['O11'], df['R1'], df['O21'], df['R2'])

# lower bound of the confidence interval (used where O11 / R1 >= O21 / R2)
lower = beta.ppf(alpha, df['O11'], df['O21'] + 1)
lower_boundary = np.log2((df['R2'] / df['R1']) * lower / (1 - lower)).clip(lower=0)

# upper bound of the confidence interval (used otherwise)
upper = beta.ppf(1 - alpha, df['O11'] + 1, df['O21'])
upper_boundary = np.log2((df['R2'] / df['R1']) * upper / (1 - upper)).clip(upper=0)

# combine, set to 0 where O11 == O21 == 0
clrr = lower_boundary.where(
(df['O11'] / df['R1']) >= (df['O21'] / df['R2']),
upper_boundary
)
clrr = clrr.where(~((df['O11'] == 0) & (df['O21'] == 0)), 0).fillna(0)

# Normal approximation (Hardie 2014)
elif boundary == 'normal':
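The deleted `np.vectorize` wrapper called `get_poisson_ci_boundary` once per row in Python; the replacement relies on `scipy.stats.beta.ppf` broadcasting over whole columns, which is where the roughly 50-fold speedup reported in performance.md comes from. A minimal illustration with toy arrays:

```
# sketch: beta.ppf broadcasts over arrays, so no per-row Python calls are needed
import numpy as np
from scipy.stats import beta

alpha = 0.001
O11 = np.array([10, 0, 5])
O21 = np.array([2, 3, 0])

# one vectorised call for all rows; an invalid shape parameter (a == 0)
# yields NaN, which the code above neutralises via .fillna(0)
lower = beta.ppf(alpha, O11, O21 + 1)
print(lower)
```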
2 changes: 1 addition & 1 deletion association_measures/version.py
@@ -2,5 +2,5 @@
Association measures are mathematical formulae that interpret cooccurrence frequency data.
"""

VERSION = (0, 2, 6)
VERSION = (0, 2, 7)
__version__ = '.'.join(map(str, VERSION))
47 changes: 47 additions & 0 deletions performance.md
@@ -0,0 +1,47 @@
# Performance
- performance is measured on a Lenovo X1 Carbon (10th generation, i7)
- input data are 24,167 observations from [brown.csv](tests/data/brown.csv)
- NB: the dataframe contains 4,241 duplicated frequency signatures, which are only scored once since v0.2.7
- for each measure, we report time needed for 1000 scorings of the whole dataframe
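The timings below come from a `timeit` harness along the lines of the following sketch (the real loop lives in performance.py further down; the `read_csv` call and the layout of brown.csv are assumptions):

```
# hedged sketch of one timing line; see performance.py for the actual harness
import timeit

iterations = 1000
setup = (
    "import pandas as pd\n"
    "import association_measures.measures as am\n"
    "df = pd.read_csv('tests/data/brown.csv', index_col=0)\n"
)
res = timeit.timeit(setup=setup, stmt='am.log_ratio(df)', number=iterations)
print(f'- {res:7.4f} :: log_ratio')
```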

## v0.2.7
- major performance improvement for conservative log-ratio with Poisson boundary (roughly a factor of 50)
```
settings: iterations=1000, df_size=24167
- 0.0871 :: contingency_table
- 1.5258 :: expected_frequencies
- 0.1507 :: z_score
- 0.2899 :: t_score
- 1.7406 :: log_likelihood
- 0.6125 :: simple_ll
- 1.2981 :: min_sensitivity
- 0.2584 :: liddell
- 0.2491 :: dice
- 0.4460 :: log_ratio
- 4.5788 :: binomial_likelihood
- 2.1891 :: conservative_log_ratio
- 29.8616 :: conservative_log_ratio_poisson
- 0.3702 :: mutual_information
- 0.4314 :: local_mutual_information
```

## v0.2.6
```
Calculate contingency_table (iterations=1000, df_size=24168): 0.0873531000688672
Calculate expected_frequencies (iterations=1000, df_size=24168): 1.5203204130521044
Calculate z_score (iterations=1000, df_size=24168): 0.14853612298611552
Calculate t_score (iterations=1000, df_size=24168): 0.2881241790018976
Calculate log_likelihood (iterations=1000, df_size=24168): 1.7284309939714149
Calculate simple_ll (iterations=1000, df_size=24168): 0.6111006899736822
Calculate min_sensitivity (iterations=1000, df_size=24168): 1.3227944150567055
Calculate liddell (iterations=1000, df_size=24168): 0.25499376200605184
Calculate dice (iterations=1000, df_size=24168): 0.2465739679755643
Calculate log_ratio (iterations=1000, df_size=24168): 0.751795751042664
Calculate binomial_likelihood (iterations=1000, df_size=24168): 4.606213430990465
Calculate conservative_log_ratio (iterations=1000, df_size=24168): 2.2395021530101076
Calculate mutual_information (iterations=1000, df_size=24168): 0.3618475969415158
Calculate local_mutual_information (iterations=1000, df_size=24168): 0.41407940594945103
```
additionally:
- conservative log ratio with Poisson boundary: ~1.5s for 1 iteration
- hypergeometric likelihood: ~2.5s for 1 iteration
18 changes: 9 additions & 9 deletions performance.py
100755 → 100644
@@ -70,7 +70,7 @@
'code': 'am.log_ratio(df)'
},
# likelihood measures
# ~2.5s for a ~25,000 rows on 8 threads
# ~2.5s for 1x ~25,000
# {
# 'name': 'hypergeometric_likelihood',
# 'code': 'am.hypergeometric_likelihood(df)'
@@ -84,11 +84,10 @@
'name': 'conservative_log_ratio',
'code': 'am.conservative_log_ratio(df)'
},
# ~1.5s for a ~25,000 rows on 8 threads
# {
# 'name': 'conservative_log_ratio_poisson',
# 'code': 'am.conservative_log_ratio(df, boundary="poisson")'
# },
{
'name': 'conservative_log_ratio_poisson',
'code': 'am.conservative_log_ratio(df, boundary="poisson")'
},
# information theory
{
'name': 'mutual_information',
@@ -100,8 +99,9 @@
},
]

size = 24167
print(f'settings: iterations={iterations}, df_size={size}')
for code in codes:
res = timeit.timeit(setup=setup, stmt=code['code'], number=iterations)
print('Calculate {func} (iterations={iter}, df_size={size}): {res}'.format(
iter=iterations, size=24168, res=res, func=code['name']
))
func = code['name']
print(f'- {res:7.4f} :: {func}')
4 changes: 1 addition & 3 deletions setup.py
@@ -1,7 +1,5 @@
#!/usr/bin/env python3


import io
import os
import sys
from setuptools import find_packages, Command
@@ -101,10 +99,10 @@ def run(self):
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Cython',
],
)