diff --git a/CITATION.cff b/CITATION.cff index af03f6d..d877348 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -18,5 +18,5 @@ repository-code: 'https://github.com/glevv/obscure_stats' repository-artifact: 'https://pypi.org/project/obscure_stats' abstract: Collection of lesser-known statistical measures license: MIT -version: 0.1.7 +version: 0.1.8 date-released: '2023-10-21' \ No newline at end of file diff --git a/README.md b/README.md index c8047c6..992424e 100644 --- a/README.md +++ b/README.md @@ -37,11 +37,12 @@ * Area Under the Skewness Curve (weighted and unweighted); * Bickel Mode Skewness Coefficient; * Bowley Skewness Coefficient; + * Cumulative Skewness Coefficient; * Forhad-Shorna Rank Skewness Coefficient; * Groeneveld Skewness Coefficient; * Hossain-Adnan Skewness Coefficient; * Kelly Skewness Coefficient; - * L-Skewness; + * L-Skewness Coefficient; * Medeen Skewness Coefficient; * Pearson Median Skewness Coefficient; * Pearson Mode Skewness Coefficient. @@ -53,18 +54,21 @@ * Moors Octile Kurtosis; * Reza-Ma Kurtosis. - Collection of measures of association - `obscure_stats/association`: - * Chatterjee Xi correlation Coefficient (original and symmetric versions); + * Blomqvist's Beta; + * Chatterjee Xi Correlation Coefficient (original and symmetric versions); * Concordance Correlation Coefficient; * Concordance Rate; + * Rank Minrelation Coefficient; * Tanimoto Similarity; + * Winsorized Correlation Coefficient; * Zhang I Correlation Coefficient. - Collection of measures of qualitative variation - `obscure_stats/variation`: * AVDev; * B Index; - * Extropy; * Gibbs M1; * Gibbs M2; * ModVR; + * Negative Extropy; * RanVR. 
## Installation diff --git a/pyproject.toml b/pyproject.toml index e698837..b1fdb0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,15 @@ [tool.poetry] name = "obscure_stats" -version = "0.1.7" +version = "0.1.8" description = "Collection of lesser-known statistical functions" -authors = ["Gleb Levitski"] +authors = ["Hleb Levitski"] readme = "README.md" classifiers = [ "Development Status :: 3 - Alpha", - "Intended Audience :: Science/Research", "Intended Audience :: Developers", + "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -16,11 +17,9 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", - "Topic :: Software Development", "Topic :: Scientific/Engineering", + "Topic :: Software Development", "Typing :: Typed", - "Operating System :: OS Independent", - "Natural Language :: English", ] [tool.poetry.dependencies] diff --git a/src/obscure_stats/association/__init__.py b/src/obscure_stats/association/__init__.py index 6deb2a9..4615248 100644 --- a/src/obscure_stats/association/__init__.py +++ b/src/obscure_stats/association/__init__.py @@ -1,19 +1,25 @@ """Association module.""" from .association import ( + blomqvistbeta, chatterjeexi, concordance_corrcoef, concordance_rate, + rank_minrelation_coefficient, symmetric_chatterjeexi, tanimoto_similarity, + winsorized_correlation, zhangi, ) __all__ = [ + "blomqvistbeta", "chatterjeexi", "concordance_corrcoef", "concordance_rate", + "rank_minrelation_coefficient", "symmetric_chatterjeexi", "tanimoto_similarity", + "winsorized_correlation", "zhangi", ] diff --git a/src/obscure_stats/association/association.py b/src/obscure_stats/association/association.py index 3233735..63e08df 100644 --- 
a/src/obscure_stats/association/association.py +++ b/src/obscure_stats/association/association.py @@ -73,9 +73,9 @@ def chatterjeexi(x: np.ndarray, y: np.ndarray) -> float: Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Target values. + Input array. Returns ------- @@ -118,9 +118,9 @@ def concordance_corrcoef(x: np.ndarray, y: np.ndarray) -> float: Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Reference values. + Input array. Returns ------- @@ -162,14 +162,14 @@ def concordance_rate( Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Reference values. + Input array. Returns ------- cr : float. - The value of the quadrant count ratio. + The value of the concordance rate. References ---------- @@ -213,14 +213,14 @@ def symmetric_chatterjeexi(x: np.ndarray, y: np.ndarray) -> float: Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Target values. + Input array. Returns ------- sxi : float. - The value of the xi correlation coefficient. + The value of the symmetric xi correlation coefficient. References ---------- @@ -266,9 +266,9 @@ def zhangi(x: np.ndarray, y: np.ndarray) -> float: Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Reference values. + Input array. Returns ------- @@ -309,14 +309,14 @@ def tanimoto_similarity(x: np.ndarray, y: np.ndarray) -> float: Parameters ---------- x : array_like - Measured values. + Input array. y : array_like - Reference values. + Input array. Returns ------- ts : float. - The value of the tanimoto similarity measure + The value of the Tanimoto similarity measure References ---------- @@ -336,3 +336,128 @@ def tanimoto_similarity(x: np.ndarray, y: np.ndarray) -> float: xx = np.mean(x**2) yy = np.mean(y**2) return xy / (xx + yy - xy) + + +def blomqvistbeta(x: np.ndarray, y: np.ndarray) -> float: + """Calculate Blomqvist's beta. 
+ + Also known as medial correlation. It is similar to Spearman Rho + and Kendall Tau correlations, but has some advantages over them. + + Parameters + ---------- + x : array_like + Input array. + y : array_like + Input array. + + Returns + ------- + bb : float. + The value of Blomqvist's beta. + + References + ---------- + Blomqvist, N. (1950). + On a measure of dependence between two random variables. + Annals of Mathematical Statistics, 21, 593-600. + + Schmid, F.; Schmidt, R. (2007). + Nonparametric Inference on Multivariate Versions of + Blomqvist's Beta and Related Measures of Tail Dependence. + Metrika, 66(3), 323-354. + + See Also + -------- + scipy.stats.spearmanr - Spearman R coefficient. + scipy.stats.kendalltau - Kendall Tau coefficient. + """ + if _check_arrays(x, y): + return np.nan + x, y = _prep_arrays(x, y) + med_x = np.median(x) + med_y = np.median(y) + return np.mean(np.sign((x - med_x) * (y - med_y))) + + +def winsorized_correlation(x: np.ndarray, y: np.ndarray, k: float = 0.1) -> float: + """Calculate winsorized correlation coefficient. + + This correlation is a robust alternative to the Pearson correlation. + + Parameters + ---------- + x : array_like + Input array. + y : array_like + Input array. + k : float + The fraction of values to winsorize on each side of the arrays. + + Returns + ------- + wcr : float. + The value of the winsorized correlation. + + References + ---------- + Wilcox, R. R. (1993). + Some Results on a Winsorized Correlation Coefficient. + British Journal of Mathematical and Statistical Psychology, 46, 339-349. + + See Also + -------- + scipy.stats.pearsonr - Pearson correlation coefficient. + """ + if _check_arrays(x, y): + return np.nan + x, y = _prep_arrays(x, y) + x_w = stats.mstats.winsorize(x, (k, k)) + y_w = stats.mstats.winsorize(y, (k, k)) + return np.corrcoef(x_w, y_w)[0, 1] + + +def rank_minrelation_coefficient(x: np.ndarray, y: np.ndarray) -> float: + """Calculate rank minrelation coefficient. 
+ + This measure estimates p(y > x) when x and y are continuous random variables. + In short, if a variable x exhibits a minrelation to y then, + as x increases, y is likely to increase too. + + Parameters + ---------- + x : array_like + Input array. + y : array_like + Input array. + + Returns + ------- + rmc : float. + The value of the rank minrelation coefficient. + + References + ---------- + Meyer, P. E. (2013). + A Rank Minrelation-Majrelation Coefficient. + arXiv preprint arXiv:1305.2038. + + Notes + ----- + This measure is asymmetric: (x, y) != (y, x). + + See Also + -------- + Concordance rate. + Concordance correlation coefficient. + """ + if _check_arrays(x, y): + return np.nan + x, y = _prep_arrays(x, y) + n_sq = len(x) ** 2 + rank_x_inc = (np.argsort(x) + 1) ** 2 / n_sq - 0.5 + rank_y_inc = (np.argsort(y) + 1) ** 2 / n_sq - 0.5 + rank_y_dec = 0.5 - (np.argsort(-y) + 1) ** 2 / n_sq + lower = np.sum((-rank_x_inc < rank_y_inc) * (rank_x_inc + rank_y_inc) ** 2) + higher = np.sum((rank_x_inc > rank_y_dec) * (rank_x_inc - rank_y_dec) ** 2) + return (lower - higher) / (lower + higher) diff --git a/src/obscure_stats/central_tendency/central_tendency.py b/src/obscure_stats/central_tendency/central_tendency.py index b40eed3..bbe9957 100644 --- a/src/obscure_stats/central_tendency/central_tendency.py +++ b/src/obscure_stats/central_tendency/central_tendency.py @@ -238,7 +238,7 @@ def half_sample_mode(x: np.ndarray) -> float: Returns ------- hsm : float - The value of Half Sample Mode. + The value of half sample mode. References ---------- diff --git a/src/obscure_stats/dispersion/dispersion.py b/src/obscure_stats/dispersion/dispersion.py index c81891c..4dc75aa 100644 --- a/src/obscure_stats/dispersion/dispersion.py +++ b/src/obscure_stats/dispersion/dispersion.py @@ -234,7 +234,7 @@ def morisita_index(x: np.ndarray) -> float: def standard_quantile_absolute_deviation(x: np.ndarray) -> float: - """Calculate Standard quantile absolute deviation. 
+ """Calculate standard quantile absolute deviation. This measure is a robust measure of dispersion, that has higher gaussian efficiency, but lower breaking point than MAD. @@ -247,7 +247,7 @@ def standard_quantile_absolute_deviation(x: np.ndarray) -> float: Returns ------- sqad : float - The value of the SQAD. + The value of the standard quantile absolute deviation. References ---------- @@ -276,7 +276,7 @@ def shamos_estimator(x: np.ndarray) -> float: Returns ------- se : float - The value of Hodges-Lehmann-Sen estimator. + The value of Shamos estimator. References ---------- @@ -311,7 +311,7 @@ def coefficient_of_range(x: np.ndarray) -> float: Returns ------- cr : float - The value of the linear coefficient of variation. + The value of the range coefficient. References ---------- diff --git a/src/obscure_stats/skewness/__init__.py b/src/obscure_stats/skewness/__init__.py index b98c92f..5bc7b4b 100644 --- a/src/obscure_stats/skewness/__init__.py +++ b/src/obscure_stats/skewness/__init__.py @@ -4,6 +4,7 @@ auc_skew_gamma, bickel_mode_skew, bowley_skew, + cumulative_skew, forhad_shorna_rank_skew, groeneveld_skew, hossain_adnan_skew, @@ -19,6 +20,7 @@ "auc_skew_gamma", "bickel_mode_skew", "bowley_skew", + "cumulative_skew", "forhad_shorna_rank_skew", "groeneveld_skew", "hossain_adnan_skew", diff --git a/src/obscure_stats/skewness/skewness.py b/src/obscure_stats/skewness/skewness.py index bbffdde..6d96d20 100644 --- a/src/obscure_stats/skewness/skewness.py +++ b/src/obscure_stats/skewness/skewness.py @@ -192,7 +192,7 @@ def groeneveld_skew(x: np.ndarray) -> float: Returns ------- - bsk : float + gsc : float The value of Groeneveld's skewness coefficinet. References @@ -263,7 +263,7 @@ def hossain_adnan_skew(x: np.ndarray) -> float: def forhad_shorna_rank_skew(x: np.ndarray) -> float: - """Calculate Forhad-Shorna coefficient of Rank Skewness. + """Calculate Forhad-Shorna coefficient of rank skewness. 
This measure is similar to Houssain and Adnan skewness coefficient, but uses differences in ranks instead of absolute differences. @@ -277,7 +277,7 @@ def forhad_shorna_rank_skew(x: np.ndarray) -> float: Returns ------- fsrs : float - The value of Forhad-Shorna coefficient of Rank Skewness. + The value of Forhad-Shorna coefficient of rank skewness. References ---------- @@ -308,7 +308,7 @@ def _auc_skew_gamma(x: np.ndarray, dp: float, w: np.ndarray | float) -> float: def auc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float: - """Calculate Area under the curve of generalized Bowley skewness coefficients. + """Calculate area under the curve of generalized Bowley skewness coefficients. This measure tries to combine multiple generalized Bowley skewness coefficients into one value. @@ -337,7 +337,7 @@ def auc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float: def wauc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float: """ - Calculate Weighted Area under the curve of generalized Bowley skewness coefficients. + Calculate weighted area under the curve of generalized Bowley skewness coefficients. This version use reweightning. It will assign bigger weights to the Bowley skewness coefficients calculated on percentiles far from the median. @@ -352,7 +352,7 @@ def wauc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float: Returns ------- aucbs : float - The value of AUC Bowley skewness. + The value of weighted AUC Bowley skewness. References ---------- @@ -364,3 +364,35 @@ def wauc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float: half_n = n // 2 w = (np.arange(half_n) / half_n)[::-1] return _auc_skew_gamma(x, dp, w) + + +def cumulative_skew(x: np.ndarray) -> float: + """ + Calculate cumulative measure of skewness. + + It is based on calculating the cumulative statistics of the Lorenz curve. + + Parameters + ---------- + x : array_like + Input array. + + Returns + ------- + cs : float + The value of cumulative skew. + + References + ---------- + Schlemmer, M. 
(2022). + A robust measure of skewness using cumulative statistic calculation. + arXiv preprint arXiv:2209.10699. + """ + n = len(x) + p = np.nancumsum(np.sort(x)) + p = p / p[-1] + r = np.arange(n) + q = r / n + d = q - p + w = (2 * r - n) * 3 / n + return np.sum(d * w) / np.sum(d) diff --git a/src/obscure_stats/variation/__init__.py b/src/obscure_stats/variation/__init__.py index a518750..ef2bf1c 100644 --- a/src/obscure_stats/variation/__init__.py +++ b/src/obscure_stats/variation/__init__.py @@ -3,19 +3,19 @@ from .variation import ( avdev, b_index, - extropy, gibbs_m1, gibbs_m2, mod_vr, + negative_extropy, range_vr, ) __all__ = [ "avdev", "b_index", - "extropy", "gibbs_m1", "gibbs_m2", "mod_vr", + "negative_extropy", "range_vr", ] diff --git a/src/obscure_stats/variation/variation.py b/src/obscure_stats/variation/variation.py index d6d16b0..1a68119 100644 --- a/src/obscure_stats/variation/variation.py +++ b/src/obscure_stats/variation/variation.py @@ -198,7 +198,7 @@ def avdev(x: np.ndarray) -> float: return 1 - (np.sum(np.abs(freq - mean)) / (2 * mean * max(k - 1, 1))) -def extropy(x: np.ndarray) -> float: +def negative_extropy(x: np.ndarray) -> float: """Calculate Negative Information Extropy (bits). This measure is complementary to entropy. @@ -216,7 +216,7 @@ def extropy(x: np.ndarray) -> float: Returns ------- ext : float - The value of extropy. + The value of negative extropy. 
References ---------- diff --git a/tests/test_association.py b/tests/test_association.py index 2bad367..e2c1f25 100644 --- a/tests/test_association.py +++ b/tests/test_association.py @@ -5,20 +5,26 @@ import numpy as np import pytest from obscure_stats.association import ( + blomqvistbeta, chatterjeexi, concordance_corrcoef, concordance_rate, + rank_minrelation_coefficient, symmetric_chatterjeexi, tanimoto_similarity, + winsorized_correlation, zhangi, ) all_functions = [ + blomqvistbeta, chatterjeexi, concordance_corrcoef, concordance_rate, + rank_minrelation_coefficient, symmetric_chatterjeexi, tanimoto_similarity, + winsorized_correlation, zhangi, ] @@ -50,17 +56,21 @@ def test_mock_association_functions( @pytest.mark.parametrize( "func", [ + blomqvistbeta, concordance_corrcoef, concordance_rate, + rank_minrelation_coefficient, tanimoto_similarity, + winsorized_correlation, ], ) def test_signed_corr_sensibility( func: typing.Callable, y_array_float: np.ndarray ) -> None: """Testing for result correctness.""" - if func(y_array_float, -y_array_float) > 0: - msg = "Corr coeff should be negative." + res = func(y_array_float, -y_array_float) + if res > 0: + msg = f"Corr coeff should be negative, got {res}" raise ValueError(msg) @@ -78,47 +88,55 @@ def test_unsigned_corr_sensibility( """Testing for result correctness.""" w = np.ones(shape=len(y_array_float)) w[0] = 2 - if func(y_array_float, -y_array_float) < func(y_array_float, w): - msg = "Corr coeff higher in the first case." 
+ res_ideal = func(y_array_float, -y_array_float) + res_normal = func(y_array_float, w) + if res_ideal < res_normal: + msg = f"Corr coeff higher in the first case, got {res_ideal} < {res_normal}" raise ValueError(msg) @pytest.mark.parametrize( "func", [ - concordance_corrcoef, - zhangi, + blomqvistbeta, chatterjeexi, + concordance_corrcoef, concordance_rate, + rank_minrelation_coefficient, symmetric_chatterjeexi, + winsorized_correlation, + zhangi, ], ) def test_const(func: typing.Callable, y_array_float: np.ndarray) -> None: """Testing for constant input.""" x = np.ones(shape=(len(y_array_float),)) with pytest.warns(match="is constant"): - if func(x, y_array_float) is not np.nan: - msg = "Corr coef should be 0 with constant input." + res = func(x, y_array_float) + if res is not np.nan: + msg = f"Corr coef should be 0 with constant input, got {res}" raise ValueError(msg) @pytest.mark.parametrize( "func", [ + blomqvistbeta, concordance_corrcoef, concordance_rate, tanimoto_similarity, symmetric_chatterjeexi, + winsorized_correlation, ], ) def test_invariance( func: typing.Callable, x_array_float: np.ndarray, y_array_float: np.ndarray ) -> None: """Testing for invariance.""" - if pytest.approx(func(x_array_float, y_array_float)) != pytest.approx( - func(y_array_float, x_array_float) - ): - msg = "Corr coef should symmetrical." 
+ xy = func(x_array_float, y_array_float) + yx = func(y_array_float, x_array_float) + if pytest.approx(xy) != pytest.approx(yx): + msg = f"Corr coef should symmetrical, got {xy}, {yx}" raise ValueError(msg) diff --git a/tests/test_skewness.py b/tests/test_skewness.py index 7df7eee..ca27699 100644 --- a/tests/test_skewness.py +++ b/tests/test_skewness.py @@ -8,6 +8,7 @@ auc_skew_gamma, bickel_mode_skew, bowley_skew, + cumulative_skew, forhad_shorna_rank_skew, groeneveld_skew, hossain_adnan_skew, @@ -23,6 +24,7 @@ auc_skew_gamma, bickel_mode_skew, bowley_skew, + cumulative_skew, forhad_shorna_rank_skew, groeneveld_skew, hossain_adnan_skew, diff --git a/tests/test_variation.py b/tests/test_variation.py index 496cb90..67213f9 100644 --- a/tests/test_variation.py +++ b/tests/test_variation.py @@ -7,20 +7,20 @@ from obscure_stats.variation import ( avdev, b_index, - extropy, gibbs_m1, gibbs_m2, mod_vr, + negative_extropy, range_vr, ) all_functions = [ avdev, b_index, - extropy, gibbs_m1, gibbs_m2, mod_vr, + negative_extropy, range_vr, ]