Skip to content

Commit

Permalink
Merge pull request #154 from glevv/add-new-stats
Browse files Browse the repository at this point in the history
Added Grenander's M, LQW, RQW and Schmid-Trede measure of peakedness
  • Loading branch information
glevv authored Nov 24, 2024
2 parents 477a61e + 63dd6d3 commit e34e92d
Show file tree
Hide file tree
Showing 12 changed files with 224 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ repository-code: 'https://github.com/glevv/obscure_stats'
repository-artifact: 'https://pypi.org/project/obscure_stats'
abstract: Collection of lesser-known statistical measures
license: MIT
version: 0.3.0
version: 0.3.1
date-released: '2023-10-21'
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "obscure_stats"
version = "0.3.0"
version = "0.3.1"
description = "Collection of lesser-known statistical functions"
authors = ["Hleb Levitski"]
readme = "README.md"
Expand Down Expand Up @@ -31,7 +31,7 @@ scipy = "^1.9.1"
mypy = "^1.6.1"
pytest = "^8.0.0"
pytest-cov = "^5.0.0"
ruff = "^0.5.0"
ruff = "^0.7.0"
hypothesis = "^6.103.1"
hypothesis-pytest = "^0.19.0"

Expand Down
2 changes: 2 additions & 0 deletions src/obscure_stats/central_tendency/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from .central_tendency import (
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand All @@ -14,6 +15,7 @@

__all__ = [
"contraharmonic_mean",
"grenanders_m",
"half_sample_mode",
"hodges_lehmann_sen_location",
"midhinge",
Expand Down
50 changes: 49 additions & 1 deletion src/obscure_stats/central_tendency/central_tendency.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def half_sample_mode(x: np.ndarray) -> float:
y = y[np.isfinite(y)]
_corner_cases = (4, 3) # for 4 samples and 3 samples
while (ny := len(y)) >= _corner_cases[0]:
half_y = ny // 2
half_y = math.ceil(ny / 2)
w_min = y[-1] - y[0]
for i in range(ny - half_y):
w = y[i + half_y - 1] - y[i]
Expand Down Expand Up @@ -308,3 +308,51 @@ def tau_location(x: np.ndarray, c: float = 4.5) -> float:
scaled_x = (x - med) / mad
w = np.square(1.0 - np.square(scaled_x / c)) * (np.abs(scaled_x) <= c)
return np.nansum(x * w) / np.nansum(w)


def grenanders_m(x: np.ndarray, p: float = 1.5, k: int = 3) -> float:
    """Calculate Grenander's Mode.

    This measure is a direct nonparametric estimation of the mode.
    It is not very robust to outliers.
    It is recommended to set k > 2p, but it is only possible if x has
    enough samples.

    Parameters
    ----------
    x : array_like
        Input array.
    p : float
        Smoothing constant; should be a float greater than 1.
    k : int
        Lag between the paired order statistics; should be an integer
        between 1 and len(x) - 1.

    Returns
    -------
    gm : float
        The value of Grenander's Mode, or NaN if there are not enough
        finite samples to form a single k-lagged difference.

    Raises
    ------
    ValueError
        If p <= 1, or if k is not a positive integer.

    References
    ----------
    Grenander, U. (1965).
    Some direct estimates of the mode.
    Annals of Mathematical Statistics, 36, 131-138.
    """
    # Validate parameters before doing any work on the data.
    if p <= 1:
        msg = "Parameter p should be a float greater than 1."
        raise ValueError(msg)
    if (k <= 0) or (not isinstance(k, int)):
        msg = "Parameter k should be an integer between 1 and length of x."
        raise ValueError(msg)

    x_sort = np.sort(x)
    # np.sort pushes NaNs to the end; keep only finite values.
    x_sort = x_sort[np.isfinite(x_sort)]

    if len(x_sort) <= k:
        return np.nan

    # Weighted average of midpoints (x[i+k] + x[i]) / 2 with weights
    # (x[i+k] - x[i])^(-p): tightly clustered samples dominate the estimate.
    # NOTE(review): k identical consecutive values give a zero gap and an
    # infinite weight, yielding NaN — confirm whether ties should be handled.
    return (
        0.5
        * np.sum((x_sort[k:] + x_sort[:-k]) / np.power(x_sort[k:] - x_sort[:-k], p))
        / np.sum(np.power(x_sort[k:] - x_sort[:-k], -p))
    )
6 changes: 6 additions & 0 deletions src/obscure_stats/kurtosis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,24 @@
crow_siddiqui_kurt,
hogg_kurt,
l_kurt,
left_quantile_weight,
moors_kurt,
moors_octile_kurt,
reza_ma_kurt,
right_quantile_weight,
schmid_trede_peakedness,
staudte_kurt,
)

__all__ = [
"crow_siddiqui_kurt",
"hogg_kurt",
"l_kurt",
"left_quantile_weight",
"moors_kurt",
"moors_octile_kurt",
"reza_ma_kurt",
"right_quantile_weight",
"schmid_trede_peakedness",
"staudte_kurt",
]
104 changes: 101 additions & 3 deletions src/obscure_stats/kurtosis/kurtosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,7 @@ def moors_octile_kurt(x: np.ndarray) -> float:
A quantile alternative for kurtosis.
Journal of the Royal Statistical Society. Series D, 37(1):25-32.
"""
o1, o2, o3, o5, o6, o7 = np.nanquantile(
x, [0.125, 0.25, 0.375, 0.625, 0.750, 0.875]
)
o1, o2, o3, o5, o6, o7 = np.nanquantile(x, [0.125, 0.25, 0.375, 0.625, 0.75, 0.875])
return ((o7 - o5) + (o3 - o1)) / (o6 - o2)


Expand Down Expand Up @@ -206,3 +204,103 @@ def staudte_kurt(x: np.ndarray) -> float:
"""
p10, p33, p66, p90 = np.nanquantile(x, [0.1, 1 / 3, 2 / 3, 0.9])
return (p90 - p10) / (p66 - p33)


def schmid_trede_peakedness(x: np.ndarray) -> float:
    """Calculate Schmid and Trede measure of peakedness P.

    It is based on inter-percentile ranges (uncentered, unscaled) and
    tries to compare two different measures of dispersion of the same
    sample.
    This measure should be more robust than moment based kurtosis.

    Parameters
    ----------
    x : array_like
        Input array.

    Returns
    -------
    stp : float
        The value of measure of peakedness P.

    References
    ----------
    Schmid, F.; Trede, M. (2003).
    Simple tests for peakedness, fat tails and leptokurtosis based on quantiles.
    Computational Statistics and Data Analysis, 43, 1-12.
    """
    # Ratio of the central 75% range to the interquartile (central 50%) range.
    p125, p25, p75, p875 = np.nanquantile(x, [0.125, 0.25, 0.75, 0.875])
    return (p875 - p125) / (p75 - p25)


def left_quantile_weight(x: np.ndarray, q: float = 0.25) -> float:
    """Calculate left quantile weight (LQW).

    A tail-weight measure built from inter-percentile ranges
    (uncentered, unscaled) of the left half of the distribution.

    Parameters
    ----------
    x : array_like
        Input array.
    q : float
        Quantile to use for the anchor; must lie strictly inside (0, 0.5).

    Returns
    -------
    lqw : float
        The value of left quantile weight.

    References
    ----------
    Brys, G.; Hubert, M.; Struyf, A. (2006).
    Robust measures of tail weight.
    Computational Statistics and Data Analysis 50(3), 733-759.
    """
    if q <= 0.0 or q >= 0.5:
        msg = "Parameter q should be in range (0, 0.5)."
        raise ValueError(msg)
    # The two outer quantiles bracket the first quartile symmetrically
    # within the lower half of the distribution.
    outer_low, anchor, outer_high = np.nanquantile(x, [q * 0.5, 0.25, (1 - q) * 0.5])
    return -(outer_high + outer_low - 2 * anchor) / (outer_high - outer_low)


def right_quantile_weight(x: np.ndarray, q: float = 0.75) -> float:
    """Calculate right quantile weight (RQW).

    A tail-weight measure built from inter-percentile ranges
    (uncentered, unscaled) of the right half of the distribution.

    Parameters
    ----------
    x : array_like
        Input array.
    q : float
        Quantile to use for the anchor; must lie strictly inside (0.5, 1.0).

    Returns
    -------
    rqw : float
        The value of right quantile weight.

    References
    ----------
    Brys, G.; Hubert, M.; Struyf, A. (2006).
    Robust measures of tail weight.
    Computational Statistics and Data Analysis 50(3), 733-759.
    """
    if q <= 0.5 or q >= 1.0:
        msg = "Parameter q should be in range (0.5, 1.0)."
        raise ValueError(msg)
    # The two outer quantiles bracket the third quartile symmetrically
    # within the upper half of the distribution.
    outer_low, anchor, outer_high = np.nanquantile(x, [1 - q * 0.5, 0.75, (1 + q) * 0.5])
    # NOTE(review): the denominator (outer_low - outer_high) is negative for any
    # non-degenerate sample; confirm this sign convention against Brys et al. (2006).
    return (outer_low + outer_high - 2 * anchor) / (outer_low - outer_high)
10 changes: 6 additions & 4 deletions tests/test_association.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Collection of tests of association module."""

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.association import (
blomqvist_beta,
chatterjee_xi,
Expand Down Expand Up @@ -118,7 +120,7 @@ def test_const(func: typing.Callable, y_array_float: np.ndarray) -> None:
x = np.ones(shape=(len(y_array_float),))
with pytest.warns(match="is constant"):
res = func(x, y_array_float)
if res is not np.nan:
if not math.isnan(res):
msg = f"Corr coef should be 0 with constant input, got {res}"
raise ValueError(msg)

Expand All @@ -129,7 +131,7 @@ def test_const_after_prep(
) -> None:
"""Testing for constant input."""
res = func(x_array_float, corr_test_data)
if res is not np.nan:
if not math.isnan(res):
msg = f"Corr coef should be 0 with constant input, got {res}"
raise ValueError(msg)

Expand Down Expand Up @@ -167,13 +169,13 @@ def test_notfinite_association(
y_array_int: np.ndarray,
) -> None:
"""Test for correct nan behaviour."""
if np.isnan(func(x_array_nan, y_array_int)):
if math.isnan(func(x_array_nan, y_array_int)):
msg = "Corr coef should support nans."
raise ValueError(msg)
with pytest.warns(match="too many missing values"):
func(x_array_nan[:2], x_array_int[:2])
with pytest.warns(match="contains inf"):
if not np.isnan(func(x_array_int, y_array_inf)):
if not math.isnan(func(x_array_int, y_array_inf)):
msg = "Corr coef should support infs."
raise ValueError(msg)

Expand Down
21 changes: 19 additions & 2 deletions tests/test_central_tendency.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

from __future__ import annotations

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.central_tendency import (
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand All @@ -23,6 +26,7 @@

all_functions = [
contraharmonic_mean,
grenanders_m,
half_sample_mode,
hodges_lehmann_sen_location,
midhinge,
Expand Down Expand Up @@ -67,7 +71,7 @@ def test_edge_cases(x_array_float: np.ndarray) -> None:
msg = "Result does not match expected output."
raise ValueError(msg)
result = standard_trimmed_harrell_davis_quantile(x_array_float[:0])
if result is not np.nan:
if not math.isnan(result):
msg = "Result does not match expected output."
raise ValueError(msg)

Expand Down Expand Up @@ -114,7 +118,7 @@ def test_hsm(hsm_test_data: np.ndarray) -> None:
@pytest.mark.parametrize("func", all_functions)
def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None:
    """Check that each statistic tolerates NaN values in its input.

    Every function in the module is expected to skip missing values
    rather than propagate them into the result.
    """
    # x_array_nan fixture contains NaN entries; a NaN result here means the
    # statistic under test failed to handle missing data.
    if math.isnan(func(x_array_nan)):
        msg = "Statistic should not return nans."
        raise ValueError(msg)

Expand All @@ -126,6 +130,19 @@ def test_tau_location(x_array_float: np.ndarray, c: float) -> None:
tau_location(x_array_float, c=c)


def test_grenaders_m(x_array_float: np.ndarray) -> None:
    """Validate error handling and the short-sample NaN path of Grenander's M."""
    with pytest.raises(ValueError, match="Parameter p should be a float"):
        grenanders_m(x_array_float, p=1)
    # Both a non-positive k and a non-integer k must be rejected.
    for bad_k in (0, 1.5):
        with pytest.raises(ValueError, match="Parameter k should be an integer"):
            grenanders_m(x_array_float, k=bad_k)  # type: ignore[arg-type]
    # With k equal to the sample size there is nothing to estimate from.
    result = grenanders_m(x_array_float, k=len(x_array_float))
    if not math.isnan(result):
        msg = "Statistic should return nans."
        raise ValueError(msg)


@given(
arrays(
dtype=np.float64,
Expand Down
8 changes: 5 additions & 3 deletions tests/test_dispersion.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Collection of tests of dispersion module."""

import math
import typing

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st
from hypothesis.extra.numpy import array_shapes, arrays

from obscure_stats.dispersion import (
coefficient_of_lvariation,
coefficient_of_range,
Expand Down Expand Up @@ -76,8 +78,8 @@ def test_mock_aggregation_functions(
def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None:
"""Testing for result correctness."""
rng = np.random.default_rng(seed)
low_disp = np.round(rng.exponential(scale=1, size=100) + 1, 2)
high_disp = np.round(rng.exponential(scale=10, size=100) + 1, 2)
low_disp = np.round(rng.exponential(scale=1, size=99) + 1, 2)
high_disp = np.round(rng.exponential(scale=10, size=99) + 1, 2)
low_disp_res = func(low_disp)
high_disp_res = func(high_disp)
if low_disp_res > high_disp_res:
Expand All @@ -91,7 +93,7 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None:
@pytest.mark.parametrize("func", all_functions)
def test_statistic_with_nans(func: typing.Callable, x_array_nan: np.ndarray) -> None:
    """Check that each statistic tolerates NaN values in its input.

    Every function in the module is expected to skip missing values
    rather than propagate them into the result.
    """
    # x_array_nan fixture contains NaN entries; a NaN result here means the
    # statistic under test failed to handle missing data.
    if math.isnan(func(x_array_nan)):
        msg = "Statistic should not return nans."
        raise ValueError(msg)

Expand Down
Loading

0 comments on commit e34e92d

Please sign in to comment.