added l-skew, l-kurt, changed l-cv, added range coefficient, xi corr refactoring
glevv · Dec 16, 2023 · b557a4e · b557a4e
1 parent 9f065cd
commit b557a4e
Show file tree

Hide file tree

Showing 19 changed files with 229 additions and 75 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -18,5 +18,5 @@ repository-code: 'https://github.com/glevv/obscure_stats'
 repository-artifact: 'https://pypi.org/project/obscure_stats'
 abstract: Collection of lesser-known statistical measures
 license: MIT
-version: 0.1.6
+version: 0.1.7
 date-released: '2023-10-21'
diff --git a/README.md b/README.md
@@ -40,11 +40,13 @@
     * Groeneveld Skewness Coefficient;
     * Hossain-Adnan Skewness Coefficient;
     * Kelly Skewness Coefficient;
+    * L-Skewness;
     * Medeen Skewness Coefficient;
     * Pearson Median Skewness Coefficient;
     * Pearson Mode Skewness Coefficient.
 - Collection of measures of kurtosis - `obscure_stats/kurtosis`:
     * Crow-Siddiqui Kurtosis;
+    * L-Kurtosis;
     * Hogg Kurtosis;
     * Moors Kurtosis;
     * Moors Octile Kurtosis;

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "obscure_stats"
-version = "0.1.6"
+version = "0.1.7"
 description = "Collection of lesser-known statistical functions"
 authors = ["Gleb Levitski"]
 readme = "README.md"

diff --git a/src/obscure_stats/association/__init__.py b/src/obscure_stats/association/__init__.py
@@ -10,10 +10,10 @@
 )
 
 __all__ = [
+    "chatterjeexi",
     "concordance_corrcoef",
     "concordance_rate",
-    "chatterjeexi",
-    "zhangi",
-    "tanimoto_similarity",
     "symmetric_chatterjeexi",
+    "tanimoto_similarity",
+    "zhangi",
 ]
diff --git a/src/obscure_stats/association/association.py b/src/obscure_stats/association/association.py
@@ -95,14 +95,15 @@ def chatterjeexi(x: np.ndarray, y: np.ndarray) -> float:
     if _check_arrays(x, y):
         return np.nan
     x, y = _prep_arrays(x, y)
+    # heavily inspired by https://github.com/czbiohub-sf/xicor/issues/17#issue-965635013
     n = len(x)
-    x_ranked = stats.rankdata(x, method="ordinal")
-    y_forward_ranked = stats.rankdata(y, method="max")
-    y_backward_ranked = stats.rankdata(-y, method="max")
-    y_forward_ranked_ordered = y_forward_ranked[np.argsort(x_ranked)]
-    nom = np.sum(np.abs(np.diff(y_forward_ranked_ordered)))
-    denom = np.sum(y_backward_ranked * (n - y_backward_ranked)) / n**3
-    return 1.0 - nom / (2 * n**2 * denom)
+    y_forward_ordered = y[np.argsort(x)]
+    _, y_unique_indexes, y_counts = np.unique(
+        y_forward_ordered, return_inverse=True, return_counts=True
+    )
+    right = np.cumsum(y_counts)[y_unique_indexes]
+    left = np.cumsum(y_counts[::-1])[len(y_counts) - y_unique_indexes - 1]
+    return 1.0 - 0.5 * np.sum(np.abs(np.diff(right))) / np.mean(left * (n - left))
 
 
 def concordance_corrcoef(x: np.ndarray, y: np.ndarray) -> float:
@@ -231,7 +232,29 @@ def symmetric_chatterjeexi(x: np.ndarray, y: np.ndarray) -> float:
     --------
     obscure_stats.association.chatterjeexi - Chatterjee Xi coefficient.
     """
-    return max(chatterjeexi(x, y), chatterjeexi(y, x))
+    if _check_arrays(x, y):
+        return np.nan
+    x, y = _prep_arrays(x, y)
+    n = len(x)
+    # y ~ f(x)
+    y_forward_ordered = y[np.argsort(x)]
+    _, y_unique_indexes, y_counts = np.unique(
+        y_forward_ordered, return_inverse=True, return_counts=True
+    )
+    right_xy = np.cumsum(y_counts)[y_unique_indexes]
+    left_xy = np.cumsum(y_counts[::-1])[len(y_counts) - y_unique_indexes - 1]
+    # x ~ f(y)
+    x_forward_ordered = x[np.argsort(y)]
+    _, x_unique_indexes, x_counts = np.unique(
+        x_forward_ordered, return_inverse=True, return_counts=True
+    )
+    right_yx = np.cumsum(x_counts)[x_unique_indexes]
+    left_yx = np.cumsum(x_counts[::-1])[len(x_counts) - x_unique_indexes - 1]
+    # choose the highest from the two
+    return 1.0 - min(
+        0.5 * np.sum(np.abs(np.diff(right_xy))) / np.mean(left_xy * (n - left_xy)),
+        0.5 * np.sum(np.abs(np.diff(right_yx))) / np.mean(left_yx * (n - left_yx)),
+    )
 
 
 def zhangi(x: np.ndarray, y: np.ndarray) -> float:

diff --git a/src/obscure_stats/central_tendency/__init__.py b/src/obscure_stats/central_tendency/__init__.py
@@ -13,11 +13,11 @@
 
 __all__ = [
     "contraharmonic_mean",
+    "half_sample_mode",
+    "hodges_lehmann_sen_location",
     "midhinge",
     "midmean",
     "midrange",
-    "trimean",
-    "hodges_lehmann_sen_location",
     "standard_trimmed_harrell_davis_quantile",
-    "half_sample_mode",
+    "trimean",
 ]
diff --git a/src/obscure_stats/central_tendency/central_tendency.py b/src/obscure_stats/central_tendency/central_tendency.py
@@ -18,7 +18,7 @@ def midrange(x: np.ndarray) -> float:
 
     Returns
     -------
-    mr : float or array_like.
+    mr : float
         The value of the midrange.
 
     References
@@ -44,7 +44,7 @@ def midhinge(x: np.ndarray) -> float:
 
     Returns
     -------
-    mh : float or array_like.
+    mh : float
         The value of the midhinge.
 
     References
@@ -69,7 +69,7 @@ def trimean(x: np.ndarray) -> float:
 
     Returns
     -------
-    tm : float or array_like.
+    tm : float
         The value of the trimean.
 
     References
@@ -96,7 +96,7 @@ def contraharmonic_mean(x: np.ndarray) -> float:
 
     Returns
     -------
-    chm : float or array_like.
+    chm : float
         The value of the contraharmonic mean.
 
     References
@@ -120,7 +120,7 @@ def midmean(x: np.ndarray) -> float:
 
     Returns
     -------
-    iqm : float or array_like.
+    iqm : float
         The value of the interquartile mean.
 
     References

diff --git a/src/obscure_stats/dispersion/__init__.py b/src/obscure_stats/dispersion/__init__.py
@@ -2,6 +2,7 @@
 
 from .dispersion import (
     coefficient_of_lvariation,
+    coefficient_of_range,
     coefficient_of_variation,
     dispersion_ratio,
     lloyds_index,
@@ -15,13 +16,14 @@
 
 __all__ = [
     "coefficient_of_lvariation",
+    "coefficient_of_range",
     "coefficient_of_variation",
     "dispersion_ratio",
     "lloyds_index",
     "morisita_index",
     "quartile_coefficient_of_dispersion",
-    "standard_quantile_absolute_deviation",
-    "studentized_range",
     "robust_coefficient_of_variation",
     "shamos_estimator",
+    "standard_quantile_absolute_deviation",
+    "studentized_range",
 ]
diff --git a/src/obscure_stats/dispersion/dispersion.py b/src/obscure_stats/dispersion/dispersion.py
@@ -3,7 +3,7 @@
 import warnings
 
 import numpy as np
-from scipy import stats  # type: ignore[import-untyped]
+from scipy import special, stats  # type: ignore[import-untyped]
 
 EPS = 1e-6
 
@@ -18,7 +18,7 @@ def studentized_range(x: np.ndarray) -> float:
 
     Returns
     -------
-    sr : float or array_like.
+    sr : float
         The value of the studentized range.
 
     References
@@ -46,7 +46,7 @@ def coefficient_of_lvariation(x: np.ndarray) -> float:
 
     Returns
     -------
-    lcv : float or array_like.
+    lcv : float
         The value of the linear coefficient of variation.
 
     References
@@ -60,7 +60,11 @@ def coefficient_of_lvariation(x: np.ndarray) -> float:
     if abs(l1) <= EPS:
         warnings.warn("Mean is close to 0. Statistic is undefined.", stacklevel=2)
         return np.inf
-    l2 = np.nanmean(np.abs(x - l1)) * 0.5
+    n = len(x)
+    _x = np.sort(x)
+    common = 1 / special.comb(n - 1, 1) / n
+    beta_1 = common * np.nansum(special.comb(np.arange(1, n), 1) * _x[1:])
+    l2 = 2 * beta_1 - l1
     return l2 / l1
 
 
@@ -74,7 +78,7 @@ def coefficient_of_variation(x: np.ndarray) -> float:
 
     Returns
     -------
-    cv : float or array_like.
+    cv : float
         The value of the coefficient of variation.
 
     References
@@ -103,7 +107,7 @@ def robust_coefficient_of_variation(x: np.ndarray) -> float:
 
     Returns
     -------
-    rcv : float or array_like.
+    rcv : float
         The value of the robust coefficient of variation.
 
     References
@@ -130,7 +134,7 @@ def quartile_coefficient_of_dispersion(x: np.ndarray) -> float:
 
     Returns
     -------
-    qcd : float or array_like.
+    qcd : float
         The value of the quartile coefficient of dispersion.
 
     References
@@ -161,7 +165,7 @@ def dispersion_ratio(x: np.ndarray) -> float:
 
     Returns
     -------
-    dr : float or array_like.
+    dr : float
         The value of the dispersion ratio.
 
     References
@@ -189,7 +193,7 @@ def lloyds_index(x: np.ndarray) -> float:
 
     Returns
     -------
-    li : float or array_like.
+    li : float
         The value of the Lloyd's index.
 
     References
@@ -216,7 +220,7 @@ def morisita_index(x: np.ndarray) -> float:
 
     Returns
     -------
-    mi : float or array_like.
+    mi : float
         The value of the Morisita's index.
 
     References
@@ -242,7 +246,7 @@ def standard_quantile_absolute_deviation(x: np.ndarray) -> float:
 
     Returns
     -------
-    sqad : float or array_like.
+    sqad : float
         The value of the SQAD.
 
     References
@@ -294,3 +298,30 @@ def shamos_estimator(x: np.ndarray) -> float:
     # whole matrix, which is equivalent.
     product = np.meshgrid(x, x, sparse=True)
     return np.nanmedian(np.abs(product[0] - product[1]))
+
+
+def coefficient_of_range(x: np.ndarray) -> float:
+    """Calculate coefficient of range (Range / Midrange).
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+
+    Returns
+    -------
+    cr : float
+        The value of the coefficient of range, (max - min) / (max + min).
+
+    References
+    ----------
+    Yadav, S. K., Singh, S.,  &  Gupta, R. (2019).
+    Measures of Dispersion.
+    In Biomedical Statistics (pp. 59-70). Springer, Singapore
+    """
+    min_ = np.nanmin(x)
+    max_ = np.nanmax(x)
+    if abs(min_ + max_) <= EPS:
+        warnings.warn("Midrange is close to 0. Statistic is undefined.", stacklevel=2)
+        return np.inf
+    return (max_ - min_) / (max_ + min_)
diff --git a/src/obscure_stats/kurtosis/__init__.py b/src/obscure_stats/kurtosis/__init__.py
@@ -3,6 +3,7 @@
 from .kurtosis import (
     crow_siddiqui_kurt,
     hogg_kurt,
+    l_kurt,
     moors_kurt,
     moors_octile_kurt,
     reza_ma_kurt,
@@ -14,4 +15,5 @@
     "moors_kurt",
     "moors_octile_kurt",
     "reza_ma_kurt",
+    "l_kurt",
 ]