Merge pull request #27 from fau-klue/v0.2.2
v0.2.2
ausgerechnet authored Aug 20, 2022
2 parents e512589 + 4fbf8f7 commit e373c2b
Showing 21 changed files with 1,449 additions and 1,260 deletions.
2 changes: 1 addition & 1 deletion Pipfile
@@ -7,9 +7,9 @@ verify_ssl = true
pytest = "==7.0.1"
pylint = "==2.13.9"
pytest-cov = "==3.0.0"
cython = "==0.29.30"
twine = "==3.7.1"
setuptools = "==59.6.0"
cython = "==0.29.30"

[packages]
wheel = ">=0.37.1"
962 changes: 493 additions & 469 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions README.md
@@ -126,7 +126,7 @@ You can thus `join` the results directly to the input.

## Association Measures

The following association measures are currently implemented (v0.2.1):
The following association measures are currently implemented (v0.2.2):

- asymptotic hypothesis tests:
- **z-score** (`z_score`)
@@ -148,7 +148,7 @@ The following association measures are currently implemented (v0.2.1):
- **local mutual information** (`local_mutual_information`)
- conservative estimates
- **conservative log-ratio** (`conservative_log_ratio`)
- parameters: `disc`, `alpha`, `correct`, `one_sided`
- parameters: `disc`, `alpha`, `correct`, `one_sided`, `boundary`

You can either calculate specific measures:

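Below is a minimal sketch, not part of this commit, of how the new `boundary` parameter could be used. It assumes the measures module is imported as `association_measures.measures` and that the input DataFrame already carries the contingency-table columns O11, O12, O21, O22 that `conservative_log_ratio` expects (see `frequencies.py`); the counts are invented.

import pandas as pd
import association_measures.measures as am

# toy contingency table for a single candidate item (made-up counts)
df = pd.DataFrame({'O11': [10], 'O12': [990], 'O21': [5], 'O22': [9995]})

# normal approximation (Hardie 2014) vs. the new exact Poisson boundary
clr_normal = am.conservative_log_ratio(df, boundary='normal')
clr_poisson = am.conservative_log_ratio(df, boundary='poisson')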
11 changes: 5 additions & 6 deletions association_measures/frequencies.py
@@ -1,7 +1,6 @@
"""
observed (O11, O12, O21, O22) and expected (E11, E12, E21, E22) frequencies
http://www.collocations.de/AM/index.html
"""

from pandas import DataFrame
Expand All @@ -19,10 +18,10 @@ def observed_frequencies(df):
Possible input formats:
- frequency signature (cf. Evert 2008: Figure 8):
f = O11 # co-occurrence freq. of token and node
f1 = R1 <int> # number of tokens in W(node)
f2 = C1 # marginal freq. of token
N = N <int> # size of corpus without nodes
f = O11 # co-occurrence freq. of token and node / freq. in corpus 1
f1 = R1 <int> # number of tokens in W(node) / size of corpus 1
f2 = C1 # marginal freq. of token / freq. in corpus 1 + 2
N = N <int> # size of corpus without nodes / size of corpus 1 + 2
- corpus frequencies ("keyword friendly"):
f1 = O11 # number of occurrences in corpus 1
f2 = O21 # number of occurrences in corpus 2
@@ -57,7 +56,7 @@ def observed_frequencies(df):
O22 = df['N2'] - O21

else:
raise ValueError('columns are not reasonably named: %s ' % str(df.columns))
raise ValueError(f'columns are not reasonably named: {str(df.columns)}')

return DataFrame(
index=df.index,
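For context, a small sketch (not part of the diff) of the two input notations the revised docstring describes; the function and column names are taken from `frequencies.py`, the counts are invented.

import pandas as pd
from association_measures.frequencies import observed_frequencies

# frequency-signature notation (Evert 2008): f, f1, f2, N
sig = pd.DataFrame({'f': [10], 'f1': [1000], 'f2': [15], 'N': [11000]})
obs = observed_frequencies(sig)    # yields columns O11, O12, O21, O22

# corpus-frequency ("keyword friendly") notation: f1, f2, N1, N2
kw = pd.DataFrame({'f1': [10], 'f2': [5], 'N1': [1000], 'N2': [10000]})
obs_kw = observed_frequencies(kw)  # same four observed frequencies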
131 changes: 82 additions & 49 deletions association_measures/measures.py
@@ -1,14 +1,11 @@
"""
Association measures are mathematical formulae that interpret cooccurrence frequency data.
association measures
http://www.collocations.de/AM/index.html
"""


import numpy as np
# from statistics import NormalDist # requires python version >= 3.8
from scipy.stats import norm # requires scipy
from pandas import concat
from scipy.stats import norm, beta

from .binomial import choose
from .frequencies import expected_frequencies, observed_frequencies
Expand All @@ -32,25 +29,25 @@ def list_measures():
'log_likelihood': log_likelihood,
'simple_ll': simple_ll,
# point estimates of association strength
'liddell': liddell,
'min_sensitivity': min_sensitivity,
'liddell': liddell,
'dice': dice,
'log_ratio': log_ratio,
# likelihood measures
# 'hypergeometric_likelihood': hypergeometric_likelihood,
# 'binomial_likelihood': binomial_likelihood,
'binomial_likelihood': binomial_likelihood,
# conservative estimates
'conservative_log_ratio': conservative_log_ratio,
# information theory
'mutual_information': mutual_information,
'local_mutual_information': local_mutual_information,
# conservative estimates
'conservative_log_ratio': conservative_log_ratio
}


def score(df, f1=None, N=None, N1=None, N2=None,
measures=None, freq=True, per_million=True, digits=6,
disc=.5, signed=True, alpha=.01, correct='Bonferroni',
one_sided=False):
def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
freq=True, per_million=True, digits=6, disc=.001,
signed=True, alpha=.001, correct='Bonferroni',
boundary='normal', vocab=None, one_sided=False):

"""Wrapper for `calculate_measures` that also allows integer counts to
be given as parameters. This is reasonable for the following notations:
@@ -95,6 +92,7 @@ def score(df, f1=None, N=None, N1=None, N2=None,

df = calculate_measures(df, measures, freq, per_million, digits,
disc=disc, signed=signed, alpha=alpha,
boundary=boundary, vocab=vocab,
correct=correct, one_sided=one_sided)

return df
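A hedged sketch (not in the diff) of a call to the updated wrapper: in the frequency-signature notation, `f1` (= R1) and `N` are constant over the whole table, so they can be passed as integers while the DataFrame only carries the per-item counts. The concrete column names, counts, and `vocab` value below are assumptions for illustration.

import pandas as pd
import association_measures.measures as am

# per-item counts; R1 (f1) and corpus size (N) are passed as integers
df = pd.DataFrame({'f': [10, 3], 'f2': [15, 8]}, index=['of course', 'as well'])

scores = am.score(df, f1=1000, N=11000,
                  boundary='poisson',  # new in v0.2.2: exact Poisson CI boundary
                  vocab=50000)         # new in v0.2.2: fixed number of comparisons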
@@ -120,7 +118,9 @@ def calculate_measures(df, measures=None, freq=False,
:param float disc: discounting (or smoothing) parameter for O11 == 0 (and O21 == 0)
:param bool signed: enforce negative values for rows with O11 < E11?
:param float alpha: CLR: significance level
:param str boundary: exact CI boundary of [poisson] distribution or [normal] approximation?
:param str correct: CLR: correction type repeated tests (None|"Bonferroni"|"Sidak")
:param int vocab: CLR: size of vocabulary (number of comparisons for correcting alpha)
:param bool one_sided: CLR: calculate one- or two-sided confidence interval
:return: association measures
@@ -382,8 +382,34 @@ def binomial_likelihood(df, **kwargs):
# CONSERVATIVE ESTIMATES #
##########################

def conservative_log_ratio(df, disc=.5, alpha=.01,
correct='Bonferroni', one_sided=False, **kwargs):
def get_poisson_ci_boundary(alpha, O11, N1, O21, N2):
"""
Get the lower (if O11 / N1 >= O21 / N2) or upper (else) bound of
the CI of a Poisson distribution
:param float alpha: sig. level
:param int O11:
:param int N1:
:param int O21:
:param int N2:
"""

if (O11 / N1) >= (O21 / N2):
lower = beta.ppf(alpha, O11, O21 + 1)
boundary = max(np.log2((N2 / N1) * lower / (1 - lower)), 0)
else:
upper = beta.ppf(1 - alpha, O11 + 1, O21)
boundary = min(np.log2((N2 / N1) * upper / (1 - upper)), 0)

return boundary


BOUNDARY = np.vectorize(get_poisson_ci_boundary)


def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
correct='Bonferroni', vocab=None,
one_sided=False, **kwargs):
"""
Calculate conservative log-ratio, i.e. the binary logarithm of the
lower bound of the confidence interval of relative risk at the
@@ -392,54 +418,61 @@ def conservative_log_ratio(df, disc=.5, alpha=.01,
:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
:param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
:param float alpha: significance level
:param str boundary: exact CI boundary of [poisson] distribution or [normal] approximation?
:param str correct: correction type for several tests (None | "Bonferroni" | "Sidak")
:param int vocab: size of vocabulary (number of comparisons for correcting alpha)
:param bool one_sided: calculate one- or two-sided confidence interval
:return: conservative log-ratio
:rtype: pd.Series
"""

# questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)

# compute natural logarithm of relative risk
# so we can use estimate for standard error of log(RR)
R1 = df['O11'] + df['O12']
R2 = df['O21'] + df['O22']
lrr = np.log((O11_disc / O21_disc) / (R1 / R2))
# correction of alpha for two-sided tests
if not one_sided:
alpha /= 2

# Bonferroni or Sidak correction
if isinstance(correct, str):
vocab = (df['O11'] >= 1).sum()
if correct == 'Bonferroni':
alpha /= vocab
elif correct == "Sidak": # TODO: improve computation
alpha = 1 - (1 - alpha) ** (1 / vocab)
if correct is not None:
if isinstance(correct, str):
vocab = (df['O11'] >= 1).sum() if vocab is None else vocab
if correct == 'Bonferroni':
alpha /= vocab
elif correct == "Sidak":
alpha = 1 - (1 - alpha) ** (1 / vocab)
# more stable alternative: alpha = 1 - exp(log(1 - alpha) / vocab)
# doesn't make any difference in practice though, e.g. alpha = .00001, vocab = 10**10
else:
raise ValueError('parameter "correct" should either be "Bonferroni" or "Sidak".')
else:
raise ValueError('parameter "correct" should either be "Bonferroni" or "Sidak".')
elif correct is None:
pass
else:
raise ValueError('parameter "correct" should either be None or a string.')
raise ValueError('parameter "correct" should either be None or a string.')

# get respective quantile of normal distribution
if not one_sided:
alpha /= 2
# z_factor = NormalDist().inv_cdf(1 - alpha)
z_factor = norm.ppf(1 - alpha)
# CONFIDENCE INTERVAL

# asymptotic standard deviation of log(RR) according to Wikipedia
lrr_sd = np.sqrt(1/O11_disc + 1/O21_disc - 1/R1 - 1/R2)
# Poisson approximation (Evert 2022)
if boundary == 'poisson':
tmp = df[['O11', 'O12', 'O21', 'O22']].copy()
tmp['N1'] = tmp['O11'] + tmp['O12']
tmp['N2'] = tmp['O21'] + tmp['O22']
clrr = BOUNDARY(alpha, tmp['O11'], tmp['N1'], tmp['O21'], tmp['N2'])

# calculate and apply appropriate boundary
ci_min = (lrr - lrr_sd * z_factor).clip(lower=0)
ci_max = (lrr + lrr_sd * z_factor).clip(upper=0)
clrr = ci_min.where(lrr >= 0, ci_max)

# adjust to binary logarithm
clrr /= np.log(2)
# Normal approximation (Hardie 2014)
elif boundary == 'normal':
R1 = df['O11'] + df['O12']
R2 = df['O21'] + df['O22']
# - questionable discounting according to Hardie (2014)
O11_disc = df['O11'].where(df['O11'] != 0, disc)
O21_disc = df['O21'].where(df['O21'] != 0, disc)
# - compute natural logarithm of relative risk so we can use estimate for standard error of log(RR)
lrr = np.log((O11_disc / O21_disc) / (R1 / R2))
# - asymptotic standard deviation of log(RR) according to Wikipedia
lrr_sd = np.sqrt(1/O11_disc + 1/O21_disc - 1/R1 - 1/R2)
# - calculate and apply appropriate boundary
z_factor = norm.ppf(1 - alpha)
ci_min = (lrr - lrr_sd * z_factor).clip(lower=0)
ci_max = (lrr + lrr_sd * z_factor).clip(upper=0)
clrr = ci_min.where(lrr >= 0, ci_max)
clrr /= np.log(2) # adjust to binary logarithm

return clrr

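For orientation, a small numeric sketch (not part of the commit) of what the new `boundary='poisson'` option computes, reusing the formula from `get_poisson_ci_boundary` above; the counts are made up.

import numpy as np
from scipy.stats import beta

# 10 occurrences in corpus 1 (1,000 tokens) vs. 5 in corpus 2 (10,000 tokens)
O11, N1, O21, N2 = 10, 1000, 5, 10000
alpha = 0.001

# here O11 / N1 >= O21 / N2, so take the lower CI bound and clamp at 0
lower = beta.ppf(alpha, O11, O21 + 1)
clr = max(np.log2((N2 / N1) * lower / (1 - lower)), 0)
# clr is the conservative log-ratio for boundary='poisson'
# (before any halving of alpha and Bonferroni/Sidak correction)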
2 changes: 1 addition & 1 deletion association_measures/version.py
@@ -2,5 +2,5 @@
Association measures are mathematical formulae that interpret cooccurrence frequency data.
"""

VERSION = (0, 2, 1)
VERSION = (0, 2, 2)
__version__ = '.'.join(map(str, VERSION))