From 63dd6d3cbe96b449561708bd4e992e46392e9e36 Mon Sep 17 00:00:00 2001 From: Gleb Levitski <36483986+glevv@users.noreply.github.com> Date: Sat, 23 Nov 2024 18:56:30 +0200 Subject: [PATCH] added grenaders m, lqw, rqw and scmid-trede measure of peakedness --- CITATION.cff | 2 +- pyproject.toml | 4 +- .../central_tendency/__init__.py | 2 + .../central_tendency/central_tendency.py | 50 ++++++++- src/obscure_stats/kurtosis/__init__.py | 6 + src/obscure_stats/kurtosis/kurtosis.py | 104 +++++++++++++++++- tests/test_association.py | 10 +- tests/test_central_tendency.py | 21 +++- tests/test_dispersion.py | 8 +- tests/test_kurtosis.py | 26 ++++- tests/test_skewness.py | 8 +- tests/test_variation.py | 8 +- 12 files changed, 224 insertions(+), 25 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 99e1600..70722fe 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,5 +17,5 @@ repository-code: 'https://github.com/glevv/obscure_stats' repository-artifact: 'https://pypi.org/project/obscure_stats' abstract: Collection of lesser-known statistical measures license: MIT -version: 0.3.0 +version: 0.3.1 date-released: '2023-10-21' diff --git a/pyproject.toml b/pyproject.toml index 2cfbb46..4e326b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "obscure_stats" -version = "0.3.0" +version = "0.3.1" description = "Collection of lesser-known statistical functions" authors = ["Hleb Levitski"] readme = "README.md" @@ -31,7 +31,7 @@ scipy = "^1.9.1" mypy = "^1.6.1" pytest = "^8.0.0" pytest-cov = "^5.0.0" -ruff = "^0.5.0" +ruff = "^0.7.0" hypothesis = "^6.103.1" hypothesis-pytest = "^0.19.0" diff --git a/src/obscure_stats/central_tendency/__init__.py b/src/obscure_stats/central_tendency/__init__.py index 2485b90..574b06e 100644 --- a/src/obscure_stats/central_tendency/__init__.py +++ b/src/obscure_stats/central_tendency/__init__.py @@ -2,6 +2,7 @@ from .central_tendency import ( contraharmonic_mean, + grenanders_m, half_sample_mode, 
hodges_lehmann_sen_location, midhinge, @@ -14,6 +15,7 @@ __all__ = [ "contraharmonic_mean", + "grenanders_m", "half_sample_mode", "hodges_lehmann_sen_location", "midhinge", diff --git a/src/obscure_stats/central_tendency/central_tendency.py b/src/obscure_stats/central_tendency/central_tendency.py index 74a1f98..bcc80d6 100644 --- a/src/obscure_stats/central_tendency/central_tendency.py +++ b/src/obscure_stats/central_tendency/central_tendency.py @@ -257,7 +257,7 @@ def half_sample_mode(x: np.ndarray) -> float: y = y[np.isfinite(y)] _corner_cases = (4, 3) # for 4 samples and 3 samples while (ny := len(y)) >= _corner_cases[0]: - half_y = ny // 2 + half_y = math.ceil(ny / 2) w_min = y[-1] - y[0] for i in range(ny - half_y): w = y[i + half_y - 1] - y[i] @@ -308,3 +308,51 @@ def tau_location(x: np.ndarray, c: float = 4.5) -> float: scaled_x = (x - med) / mad w = np.square(1.0 - np.square(scaled_x / c)) * (np.abs(scaled_x) <= c) return np.nansum(x * w) / np.nansum(w) + + +def grenanders_m(x: np.ndarray, p: float = 1.5, k: int = 3) -> float: + """Calculate Grenander's Mode. + + This measure is a direct nonparametric estimation of the mode. + It is not very robust to outliers. + It is recommended to set k > 2p, but it is only possible if x has + enough samples. + + Parameters + ---------- + x : array_like + Input array. + p : float + Smoothing constant. + k : int + The number of samples to exclude from the calculation. + + Returns + ------- + gm : float + The value of Grenander's Mode + + References + ---------- + Grenander, U. (1965). + Some direct estimates of the mode. + Annals of Mathematical Statistics, 36, 131-138. + """ + x_sort = np.sort(x) + x_sort = x_sort[np.isfinite(x_sort)] + + if p <= 1: + msg = "Parameter p should be a float greater than 1." + raise ValueError(msg) + if (k <= 0) or (not isinstance(k, int)): + msg = "Parameter k should be an integer between 1 and length of x." 
+ raise ValueError(msg) + + if len(x_sort) <= k: + return np.nan + + return ( + 0.5 + * np.sum((x_sort[k:] + x_sort[:-k]) / np.power(x_sort[k:] - x_sort[:-k], p)) + / np.sum(np.power(x_sort[k:] - x_sort[:-k], -p)) + ) diff --git a/src/obscure_stats/kurtosis/__init__.py b/src/obscure_stats/kurtosis/__init__.py index a5d0ad6..73c823e 100644 --- a/src/obscure_stats/kurtosis/__init__.py +++ b/src/obscure_stats/kurtosis/__init__.py @@ -4,9 +4,12 @@ crow_siddiqui_kurt, hogg_kurt, l_kurt, + left_quantile_weight, moors_kurt, moors_octile_kurt, reza_ma_kurt, + right_quantile_weight, + schmid_trede_peakedness, staudte_kurt, ) @@ -14,8 +17,11 @@ "crow_siddiqui_kurt", "hogg_kurt", "l_kurt", + "left_quantile_weight", "moors_kurt", "moors_octile_kurt", "reza_ma_kurt", + "right_quantile_weight", + "schmid_trede_peakedness", "staudte_kurt", ] diff --git a/src/obscure_stats/kurtosis/kurtosis.py b/src/obscure_stats/kurtosis/kurtosis.py index 88577eb..55d9b44 100644 --- a/src/obscure_stats/kurtosis/kurtosis.py +++ b/src/obscure_stats/kurtosis/kurtosis.py @@ -87,9 +87,7 @@ def moors_octile_kurt(x: np.ndarray) -> float: A quantile alternative for kurtosis. Journal of the Royal Statistical Society. Series D, 37(1):25-32. """ - o1, o2, o3, o5, o6, o7 = np.nanquantile( - x, [0.125, 0.25, 0.375, 0.625, 0.750, 0.875] - ) + o1, o2, o3, o5, o6, o7 = np.nanquantile(x, [0.125, 0.25, 0.375, 0.625, 0.75, 0.875]) return ((o7 - o5) + (o3 - o1)) / (o6 - o2) @@ -206,3 +204,103 @@ def staudte_kurt(x: np.ndarray) -> float: """ p10, p33, p66, p90 = np.nanquantile(x, [0.1, 1 / 3, 2 / 3, 0.9]) return (p90 - p10) / (p66 - p33) + + +def schmid_trede_peakedness(x: np.ndarray) -> float: + """Calculate Schmid and Trede measure of peakedness P. + + It is based on inter-percentile ranges (uncentered, unscaled) and + tries to compare two different measures of dispersion of the same + sample. + This measure should be more robust than moment based kurtosis. 
+ + Parameters + ---------- + x : array_like + Input array. + + Returns + ------- + stp : float + The value of measure of peakedness P. + + References + ---------- + Schmid, F.; Trede, M. (2003). + Simple tests for peakedness, fat tails and leptokurtosis based on quantiles. + Computational Statistics and Data Analysis, 43, 1-12. + """ + p125, p25, p75, p875 = np.nanquantile(x, [0.125, 0.25, 0.75, 0.875]) + return (p875 - p125) / (p75 - p25) + + +def left_quantile_weight(x: np.ndarray, q: float = 0.25) -> float: + """Calculate left quantile weight (LQW). + + It is based on inter-percentile ranges (uncentered, unscaled) of the + left tail of the distribution. + + Parameters + ---------- + x : array_like + Input array. + q : float + Quantile to use for the anchor. + + Returns + ------- + lqw : float + The value of left quantile weight. + + References + ---------- + Brys, G.; Hubert, M.; Struyf, A. (2006). + Robust measures of tail weight. + Computational Statistics and Data Analysis 50(3), 733-759. + """ + min_q, max_q = 0.0, 0.5 + if q <= min_q or q >= max_q: + msg = "Parameter q should be in range (0, 0.5)." + raise ValueError(msg) + lower_quantile, q025, upper_quantile = np.nanquantile( + x, [q * 0.5, 0.25, (1 - q) * 0.5] + ) + return -(upper_quantile + lower_quantile - 2 * q025) / ( + upper_quantile - lower_quantile + ) + + +def right_quantile_weight(x: np.ndarray, q: float = 0.75) -> float: + """Calculate right quantile weight (RQW). + + It is based on inter-percentile ranges (uncentered, unscaled) of the + right tail of the distribution. + + Parameters + ---------- + x : array_like + Input array. + q : float + Quantile to use for the anchor. + + Returns + ------- + rqw : float + The value of right quantile weight. + + References + ---------- + Brys, G.; Hubert, M.; Struyf, A. (2006). + Robust measures of tail weight. + Computational Statistics and Data Analysis 50(3), 733-759. 
+ """ + min_q, max_q = 0.5, 1.0 + if q <= min_q or q >= max_q: + msg = "Parameter q should be in range (0.5, 1.0)." + raise ValueError(msg) + lower_quantile, q075, upper_quantile = np.nanquantile( + x, [1 - q * 0.5, 0.75, (1 + q) * 0.5] + ) + return (lower_quantile + upper_quantile - 2 * q075) / ( + lower_quantile - upper_quantile + ) diff --git a/tests/test_association.py b/tests/test_association.py index d95b41b..364323e 100644 --- a/tests/test_association.py +++ b/tests/test_association.py @@ -1,5 +1,6 @@ """Collection of tests of association module.""" +import math import typing import numpy as np @@ -7,6 +8,7 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.association import ( blomqvist_beta, chatterjee_xi, @@ -118,7 +120,7 @@ def test_const(func: typing.Callable, y_array_float: np.ndarray) -> None: x = np.ones(shape=(len(y_array_float),)) with pytest.warns(match="is constant"): res = func(x, y_array_float) - if res is not np.nan: + if not math.isnan(res): msg = f"Corr coef should be 0 with constant input, got {res}" raise ValueError(msg) @@ -129,7 +131,7 @@ def test_const_after_prep( ) -> None: """Testing for constant input.""" res = func(x_array_float, corr_test_data) - if res is not np.nan: + if not math.isnan(res): msg = f"Corr coef should be 0 with constant input, got {res}" raise ValueError(msg) @@ -167,13 +169,13 @@ def test_notfinite_association( y_array_int: np.ndarray, ) -> None: """Test for correct nan behaviour.""" - if np.isnan(func(x_array_nan, y_array_int)): + if math.isnan(func(x_array_nan, y_array_int)): msg = "Corr coef should support nans." raise ValueError(msg) with pytest.warns(match="too many missing values"): func(x_array_nan[:2], x_array_int[:2]) with pytest.warns(match="contains inf"): - if not np.isnan(func(x_array_int, y_array_inf)): + if not math.isnan(func(x_array_int, y_array_inf)): msg = "Corr coef should support infs." 
raise ValueError(msg) diff --git a/tests/test_central_tendency.py b/tests/test_central_tendency.py index 77e69f1..d53723b 100644 --- a/tests/test_central_tendency.py +++ b/tests/test_central_tendency.py @@ -2,6 +2,7 @@ from __future__ import annotations +import math import typing import numpy as np @@ -9,8 +10,10 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.central_tendency import ( contraharmonic_mean, + grenanders_m, half_sample_mode, hodges_lehmann_sen_location, midhinge, @@ -23,6 +26,7 @@ all_functions = [ contraharmonic_mean, + grenanders_m, half_sample_mode, hodges_lehmann_sen_location, midhinge, @@ -67,7 +71,7 @@ def test_edge_cases(x_array_float: np.ndarray) -> None: msg = "Result does not match expected output." raise ValueError(msg) result = standard_trimmed_harrell_davis_quantile(x_array_float[:0]) - if result is not np.nan: + if not math.isnan(result): msg = "Result does not match expected output." raise ValueError(msg) @@ -114,7 +118,7 @@ def test_hsm(hsm_test_data: np.ndarray) -> None: @pytest.mark.parametrize("func", all_functions) def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None: """Test for different data types.""" - if np.isnan(func(x_array_nan)): + if math.isnan(func(x_array_nan)): msg = "Statistic should not return nans." 
raise ValueError(msg) @@ -126,6 +130,19 @@ def test_tau_location(x_array_float: np.ndarray, c: float) -> None: tau_location(x_array_float, c=c) +def test_grenaders_m(x_array_float: np.ndarray) -> None: + """Test that function will raise error if p or k parameters are incorrect.""" + with pytest.raises(ValueError, match="Parameter p should be a float"): + grenanders_m(x_array_float, p=1) + with pytest.raises(ValueError, match="Parameter k should be an integer"): + grenanders_m(x_array_float, k=0) + with pytest.raises(ValueError, match="Parameter k should be an integer"): + grenanders_m(x_array_float, k=1.5) # type: ignore[arg-type] + if not math.isnan(grenanders_m(x_array_float, k=len(x_array_float))): + msg = "Statistic should return nans." + raise ValueError(msg) + + @given( arrays( dtype=np.float64, diff --git a/tests/test_dispersion.py b/tests/test_dispersion.py index 9e8ae33..ebce60d 100644 --- a/tests/test_dispersion.py +++ b/tests/test_dispersion.py @@ -1,5 +1,6 @@ """Collection of tests of dispersion module.""" +import math import typing import numpy as np @@ -7,6 +8,7 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.dispersion import ( coefficient_of_lvariation, coefficient_of_range, @@ -76,8 +78,8 @@ def test_mock_aggregation_functions( def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None: """Testing for result correctness.""" rng = np.random.default_rng(seed) - low_disp = np.round(rng.exponential(scale=1, size=100) + 1, 2) - high_disp = np.round(rng.exponential(scale=10, size=100) + 1, 2) + low_disp = np.round(rng.exponential(scale=1, size=99) + 1, 2) + high_disp = np.round(rng.exponential(scale=10, size=99) + 1, 2) low_disp_res = func(low_disp) high_disp_res = func(high_disp) if low_disp_res > high_disp_res: @@ -91,7 +93,7 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None: @pytest.mark.parametrize("func", 
all_functions) def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None: """Test for different data types.""" - if np.isnan(func(x_array_nan)): + if math.isnan(func(x_array_nan)): msg = "Statistic should not return nans." raise ValueError(msg) diff --git a/tests/test_kurtosis.py b/tests/test_kurtosis.py index 75d92b1..331209d 100644 --- a/tests/test_kurtosis.py +++ b/tests/test_kurtosis.py @@ -1,5 +1,6 @@ """Collection of tests of kurtosis module.""" +import math import typing import numpy as np @@ -7,13 +8,17 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.kurtosis import ( crow_siddiqui_kurt, hogg_kurt, l_kurt, + left_quantile_weight, moors_kurt, moors_octile_kurt, reza_ma_kurt, + right_quantile_weight, + schmid_trede_peakedness, staudte_kurt, ) @@ -21,9 +26,12 @@ crow_siddiqui_kurt, hogg_kurt, l_kurt, + left_quantile_weight, moors_kurt, moors_octile_kurt, reza_ma_kurt, + right_quantile_weight, + schmid_trede_peakedness, staudte_kurt, ] @@ -45,10 +53,14 @@ def test_mock_aggregation_functions( def test_kurt_sensibility(func: typing.Callable, seed: int) -> None: """Testing for result correctness.""" rng = np.random.default_rng(seed) - platy = rng.uniform(size=100) - lepto = rng.laplace(size=100) + platy = rng.uniform(size=99) + lepto = rng.laplace(size=99) platy_res = func(platy) lepto_res = func(lepto) + if func.__name__ == "right_quantile_weight": + # ugly but more harmonized this way + platy_res = -platy_res + lepto_res = -lepto_res if platy_res > lepto_res: msg = ( f"Kurtosis in the first case should be lower, got {platy_res} > {lepto_res}" @@ -59,11 +71,19 @@ def test_kurt_sensibility(func: typing.Callable, seed: int) -> None: @pytest.mark.parametrize("func", all_functions) def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None: """Test for different data types.""" - if np.isnan(func(x_array_nan)): + if 
math.isnan(func(x_array_nan)): msg = "Statistic should not return nans." raise ValueError(msg) +@pytest.mark.parametrize("func", [right_quantile_weight, left_quantile_weight]) +@pytest.mark.parametrize("q", [0.0, 1.0]) +def test_q_in_qw(x_array_float: np.ndarray, func: typing.Callable, q: float) -> None: + """Simple test case for correctness of q.""" + with pytest.raises(ValueError, match="Parameter q should be in range"): + func(x_array_float, q=q) + + @given( arrays( dtype=np.float64, diff --git a/tests/test_skewness.py b/tests/test_skewness.py index 662ac21..de64a13 100644 --- a/tests/test_skewness.py +++ b/tests/test_skewness.py @@ -1,5 +1,6 @@ """Collection of tests of skewness module.""" +import math import typing import numpy as np @@ -7,6 +8,7 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.skewness import ( auc_skew_gamma, bickel_mode_skew, @@ -57,8 +59,8 @@ def test_skew_sensibility(func: typing.Callable, seed: int) -> None: """Testing for result correctness.""" rng = np.random.default_rng(seed) - no_skew = np.round(rng.normal(size=100), 2) - left_skew = np.round(rng.exponential(size=100) + 1, 2) + no_skew = np.round(rng.normal(size=99), 2) + left_skew = np.round(rng.exponential(size=99) + 1, 2) no_skew_res = func(no_skew) left_skew_res = func(left_skew) if no_skew_res > left_skew_res: @@ -81,7 +83,7 @@ def test_rank_skew(rank_skewness_test_data: np.ndarray) -> None: @pytest.mark.parametrize("func", all_functions) def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None: """Test for different data types.""" - if np.isnan(func(x_array_nan)): + if math.isnan(func(x_array_nan)): msg = "Statistic should not return nans." 
raise ValueError(msg) diff --git a/tests/test_variation.py b/tests/test_variation.py index 2da65e9..b6eacbb 100644 --- a/tests/test_variation.py +++ b/tests/test_variation.py @@ -1,5 +1,6 @@ """Collection of tests of variation module.""" +import math import typing import numpy as np @@ -7,6 +8,7 @@ from hypothesis import given from hypothesis import strategies as st from hypothesis.extra.numpy import array_shapes, arrays + from obscure_stats.variation import ( avdev, b_index, @@ -47,8 +49,8 @@ def test_mock_variation_functions( def test_var_sensibility_higher_better(func: typing.Callable, seed: int) -> None: """Testing for result correctness.""" rng = np.random.default_rng(seed) - low_var = rng.choice(["a", "b", "c", "d"], p=[0.25, 0.25, 0.25, 0.25], size=100) - high_var = rng.choice(["a", "b", "c", "d"], p=[0.75, 0.15, 0.05, 0.05], size=100) + low_var = rng.choice(["a", "b", "c", "d"], p=[0.25, 0.25, 0.25, 0.25], size=99) + high_var = rng.choice(["a", "b", "c", "d"], p=[0.75, 0.15, 0.05, 0.05], size=99) low_var_res = func(low_var) high_var_res = func(high_var) if low_var_res < high_var_res: @@ -59,7 +61,7 @@ def test_var_sensibility_higher_better(func: typing.Callable, seed: int) -> None @pytest.mark.parametrize("func", all_functions) def test_statistic_with_nans(func: typing.Callable, c_array_nan: np.ndarray) -> None: """Test for different data types.""" - if np.isnan(func(c_array_nan)): + if math.isnan(func(c_array_nan)): msg = "Statistic should not return nans." raise ValueError(msg)