Skip to content

Commit

Permalink
Merge pull request #154 from glevv/add-new-stats
Browse files Browse the repository at this point in the history
Added Grenander's M, LQW, RQW and Schmid-Trede measure of peakedness
  • Loading branch information
glevv authored Nov 24, 2024
2 parents 477a61e + 63dd6d3 commit e34e92d
Show file tree
Hide file tree
Showing 12 changed files with 224 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ repository-code: 'https://github.com/glevv/obscure_stats'
repository-artifact: 'https://pypi.org/project/obscure_stats'
abstract: Collection of lesser-known statistical measures
license: MIT
version: 0.3.0
version: 0.3.1
date-released: '2023-10-21'
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "obscure_stats"
version = "0.3.0"
version = "0.3.1"
description = "Collection of lesser-known statistical functions"
authors = ["Hleb Levitski"]
readme = "README.md"
Expand Down Expand Up @@ -31,7 +31,7 @@ scipy = "^1.9.1"
mypy = "^1.6.1"
pytest = "^8.0.0"
pytest-cov = "^5.0.0"
ruff = "^0.5.0"
ruff = "^0.7.0"
hypothesis = "^6.103.1"
hypothesis-pytest = "^0.19.0"

Expand Down
2 changes: 2 additions & 0 deletions src/obscure_stats/central_tendency/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from .central_tendency import (
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand All @@ -14,6 +15,7 @@

__all__ = [
"contraharmonic_mean",
"grenanders_m",
"half_sample_mode",
"hodges_lehmann_sen_location",
"midhinge",
Expand Down
50 changes: 49 additions & 1 deletion src/obscure_stats/central_tendency/central_tendency.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def half_sample_mode(x: np.ndarray) -> float:
y = y[np.isfinite(y)]
_corner_cases = (4, 3) # for 4 samples and 3 samples
while (ny := len(y)) >= _corner_cases[0]:
half_y = ny // 2
half_y = math.ceil(ny / 2)
w_min = y[-1] - y[0]
for i in range(ny - half_y):
w = y[i + half_y - 1] - y[i]
Expand Down Expand Up @@ -308,3 +308,51 @@ def tau_location(x: np.ndarray, c: float = 4.5) -> float:
scaled_x = (x - med) / mad
w = np.square(1.0 - np.square(scaled_x / c)) * (np.abs(scaled_x) <= c)
return np.nansum(x * w) / np.nansum(w)


def grenanders_m(x: np.ndarray, p: float = 1.5, k: int = 3) -> float:
    """Calculate Grenander's Mode.

    This measure is a direct nonparametric estimation of the mode.
    It is not very robust to outliers.
    It is recommended to set k > 2p, but it is only possible if x has
    enough samples.

    Parameters
    ----------
    x : array_like
        Input array.
    p : float
        Smoothing constant; should be a float greater than 1.
    k : int
        Lag between the paired order statistics; should be an integer
        between 1 and len(x) - 1.

    Returns
    -------
    gm : float
        The value of Grenander's Mode, or NaN if there are not enough
        finite samples to form a single k-lagged difference.

    Raises
    ------
    ValueError
        If p <= 1, or if k is not a positive integer.

    References
    ----------
    Grenander, U. (1965).
    Some direct estimates of the mode.
    Annals of Mathematical Statistics, 36, 131-138.
    """
    # Validate parameters before doing any work on the data.
    if p <= 1:
        msg = "Parameter p should be a float greater than 1."
        raise ValueError(msg)
    if (k <= 0) or (not isinstance(k, int)):
        msg = "Parameter k should be an integer between 1 and length of x."
        raise ValueError(msg)

    x_sort = np.sort(x)
    # np.sort pushes NaNs to the end; keep only finite values.
    x_sort = x_sort[np.isfinite(x_sort)]

    if len(x_sort) <= k:
        return np.nan

    # Weighted average of midpoints (x[i+k] + x[i]) / 2 with weights
    # (x[i+k] - x[i])^(-p): tightly clustered samples dominate the estimate.
    # NOTE(review): k identical consecutive values give a zero gap and an
    # infinite weight, yielding NaN — confirm whether ties should be handled.
    return (
        0.5
        * np.sum((x_sort[k:] + x_sort[:-k]) / np.power(x_sort[k:] - x_sort[:-k], p))
        / np.sum(np.power(x_sort[k:] - x_sort[:-k], -p))
    )
6 changes: 6 additions & 0 deletions src/obscure_stats/kurtosis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,24 @@
crow_siddiqui_kurt,
hogg_kurt,
l_kurt,
left_quantile_weight,
moors_kurt,
moors_octile_kurt,
reza_ma_kurt,
right_quantile_weight,
schmid_trede_peakedness,
staudte_kurt,
)

__all__ = [
"crow_siddiqui_kurt",
"hogg_kurt",
"l_kurt",
"left_quantile_weight",
"moors_kurt",
"moors_octile_kurt",
"reza_ma_kurt",
"right_quantile_weight",
"schmid_trede_peakedness",
"staudte_kurt",
]
104 changes: 101 additions & 3 deletions src/obscure_stats/kurtosis/kurtosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,7 @@ def moors_octile_kurt(x: np.ndarray) -> float:
A quantile alternative for kurtosis.
Journal of the Royal Statistical Society. Series D, 37(1):25-32.
"""
o1, o2, o3, o5, o6, o7 = np.nanquantile(
x, [0.125, 0.25, 0.375, 0.625, 0.750, 0.875]
)
o1, o2, o3, o5, o6, o7 = np.nanquantile(x, [0.125, 0.25, 0.375, 0.625, 0.75, 0.875])
return ((o7 - o5) + (o3 - o1)) / (o6 - o2)


Expand Down Expand Up @@ -206,3 +204,103 @@ def staudte_kurt(x: np.ndarray) -> float:
"""
p10, p33, p66, p90 = np.nanquantile(x, [0.1, 1 / 3, 2 / 3, 0.9])
return (p90 - p10) / (p66 - p33)


def schmid_trede_peakedness(x: np.ndarray) -> float:
    """Calculate Schmid and Trede measure of peakedness P.

    It is based on inter-percentile ranges (uncentered, unscaled) and
    tries to compare two different measures of dispersion of the same
    sample.
    This measure should be more robust than moment based kurtosis.

    Parameters
    ----------
    x : array_like
        Input array.

    Returns
    -------
    stp : float
        The value of measure of peakedness P.

    References
    ----------
    Schmid, F.; Trede, M. (2003).
    Simple tests for peakedness, fat tails and leptokurtosis based on quantiles.
    Computational Statistics and Data Analysis, 43, 1-12.
    """
    # Ratio of the central 75% range to the interquartile (central 50%) range.
    p125, p25, p75, p875 = np.nanquantile(x, [0.125, 0.25, 0.75, 0.875])
    return (p875 - p125) / (p75 - p25)


def left_quantile_weight(x: np.ndarray, q: float = 0.25) -> float:
    """Calculate left quantile weight (LQW).

    A tail-weight measure built from inter-percentile ranges
    (uncentered, unscaled) of the left half of the distribution.

    Parameters
    ----------
    x : array_like
        Input array.
    q : float
        Quantile to use for the anchor; must lie strictly inside (0, 0.5).

    Returns
    -------
    lqw : float
        The value of left quantile weight.

    References
    ----------
    Brys, G.; Hubert, M.; Struyf, A. (2006).
    Robust measures of tail weight.
    Computational Statistics and Data Analysis 50(3), 733-759.
    """
    if q <= 0.0 or q >= 0.5:
        msg = "Parameter q should be in range (0, 0.5)."
        raise ValueError(msg)
    # The two outer quantiles bracket the first quartile symmetrically
    # within the lower half of the distribution.
    outer_low, anchor, outer_high = np.nanquantile(x, [q * 0.5, 0.25, (1 - q) * 0.5])
    return -(outer_high + outer_low - 2 * anchor) / (outer_high - outer_low)


def right_quantile_weight(x: np.ndarray, q: float = 0.75) -> float:
    """Calculate right quantile weight (RQW).

    A tail-weight measure built from inter-percentile ranges
    (uncentered, unscaled) of the right half of the distribution.

    Parameters
    ----------
    x : array_like
        Input array.
    q : float
        Quantile to use for the anchor; must lie strictly inside (0.5, 1.0).

    Returns
    -------
    rqw : float
        The value of right quantile weight.

    References
    ----------
    Brys, G.; Hubert, M.; Struyf, A. (2006).
    Robust measures of tail weight.
    Computational Statistics and Data Analysis 50(3), 733-759.
    """
    if q <= 0.5 or q >= 1.0:
        msg = "Parameter q should be in range (0.5, 1.0)."
        raise ValueError(msg)
    # The two outer quantiles bracket the third quartile symmetrically
    # within the upper half of the distribution.
    outer_low, anchor, outer_high = np.nanquantile(x, [1 - q * 0.5, 0.75, (1 + q) * 0.5])
    # NOTE(review): the denominator (outer_low - outer_high) is negative for any
    # non-degenerate sample; confirm this sign convention against Brys et al. (2006).
    return (outer_low + outer_high - 2 * anchor) / (outer_low - outer_high)
10 changes: 6 additions & 4 deletions tests/test_association.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Collection of tests of association module."""

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.association import (
blomqvist_beta,
chatterjee_xi,
Expand Down Expand Up @@ -118,7 +120,7 @@ def test_const(func: typing.Callable, y_array_float: np.ndarray) -> None:
x = np.ones(shape=(len(y_array_float),))
with pytest.warns(match="is constant"):
res = func(x, y_array_float)
if res is not np.nan:
if not math.isnan(res):
msg = f"Corr coef should be 0 with constant input, got {res}"
raise ValueError(msg)

Expand All @@ -129,7 +131,7 @@ def test_const_after_prep(
) -> None:
"""Testing for constant input."""
res = func(x_array_float, corr_test_data)
if res is not np.nan:
if not math.isnan(res):
msg = f"Corr coef should be 0 with constant input, got {res}"
raise ValueError(msg)

Expand Down Expand Up @@ -167,13 +169,13 @@ def test_notfinite_association(
y_array_int: np.ndarray,
) -> None:
"""Test for correct nan behaviour."""
if np.isnan(func(x_array_nan, y_array_int)):
if math.isnan(func(x_array_nan, y_array_int)):
msg = "Corr coef should support nans."
raise ValueError(msg)
with pytest.warns(match="too many missing values"):
func(x_array_nan[:2], x_array_int[:2])
with pytest.warns(match="contains inf"):
if not np.isnan(func(x_array_int, y_array_inf)):
if not math.isnan(func(x_array_int, y_array_inf)):
msg = "Corr coef should support infs."
raise ValueError(msg)

Expand Down
21 changes: 19 additions & 2 deletions tests/test_central_tendency.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

from __future__ import annotations

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.central_tendency import (
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand All @@ -23,6 +26,7 @@

all_functions = [
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand Down Expand Up @@ -67,7 +71,7 @@ def test_edge_cases(x_array_float: np.ndarray) -> None:
msg = "Result does not match expected output."
raise ValueError(msg)
result = standard_trimmed_harrell_davis_quantile(x_array_float[:0])
if result is not np.nan:
if not math.isnan(result):
msg = "Result does not match expected output."
raise ValueError(msg)

Expand Down Expand Up @@ -114,7 +118,7 @@ def test_hsm(hsm_test_data: np.ndarray) -> None:
@pytest.mark.parametrize("func", all_functions)
def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None:
    """Check that each statistic tolerates NaN values in its input.

    Every function in the module is expected to skip missing values
    rather than propagate them into the result.
    """
    # x_array_nan fixture contains NaN entries; a NaN result here means the
    # statistic under test failed to handle missing data.
    if math.isnan(func(x_array_nan)):
        msg = "Statistic should not return nans."
        raise ValueError(msg)

Expand All @@ -126,6 +130,19 @@ def test_tau_location(x_array_float: np.ndarray, c: float) -> None:
tau_location(x_array_float, c=c)


def test_grenaders_m(x_array_float: np.ndarray) -> None:
    """Validate error handling and the short-sample NaN path of Grenander's M."""
    with pytest.raises(ValueError, match="Parameter p should be a float"):
        grenanders_m(x_array_float, p=1)
    # Both a non-positive k and a non-integer k must be rejected.
    for bad_k in (0, 1.5):
        with pytest.raises(ValueError, match="Parameter k should be an integer"):
            grenanders_m(x_array_float, k=bad_k)  # type: ignore[arg-type]
    # With k equal to the sample size there is nothing to estimate from.
    result = grenanders_m(x_array_float, k=len(x_array_float))
    if not math.isnan(result):
        msg = "Statistic should return nans."
        raise ValueError(msg)


@given(
arrays(
dtype=np.float64,
Expand Down
8 changes: 5 additions & 3 deletions tests/test_dispersion.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Collection of tests of dispersion module."""

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.dispersion import (
coefficient_of_lvariation,
coefficient_of_range,
Expand Down Expand Up @@ -76,8 +78,8 @@ def test_mock_aggregation_functions(
def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None:
"""Testing for result correctness."""
rng = np.random.default_rng(seed)
low_disp = np.round(rng.exponential(scale=1, size=100) + 1, 2)
high_disp = np.round(rng.exponential(scale=10, size=100) + 1, 2)
low_disp = np.round(rng.exponential(scale=1, size=99) + 1, 2)
high_disp = np.round(rng.exponential(scale=10, size=99) + 1, 2)
low_disp_res = func(low_disp)
high_disp_res = func(high_disp)
if low_disp_res > high_disp_res:
Expand All @@ -91,7 +93,7 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None:
@pytest.mark.parametrize("func", all_functions)
def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None:
    """Check that each statistic tolerates NaN values in its input.

    Every function in the module is expected to skip missing values
    rather than propagate them into the result.
    """
    # x_array_nan fixture contains NaN entries; a NaN result here means the
    # statistic under test failed to handle missing data.
    if math.isnan(func(x_array_nan)):
        msg = "Statistic should not return nans."
        raise ValueError(msg)

Expand Down
Loading

0 comments on commit e34e92d

Please sign in to comment.