diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml new file mode 100644 index 0000000..60fdc7b --- /dev/null +++ b/.github/workflows/package.yml @@ -0,0 +1,74 @@ +name: CI + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + PYTHONUNBUFFERED: "1" + PYTHONHASHSEED: "1" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: True + defaults: + run: + shell: bash + + steps: + - name: Check out repository + uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Activate env + run: | + source $VENV + + - name: Run ruff + run: | + poetry run ruff check . + poetry run ruff format . --check + + - name: Run mypy + run: | + poetry run mypy . 
+ + - name: Run tests + run: | + poetry run pytest --cov=obscure_stats --cov-report term --cov-report xml:coverage.xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + flags: smart-tests + verbose: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file diff --git a/README.md b/README.md index 92ac33e..5a58994 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,71 @@ | | | | --- | --- | -| Package | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/obscure_stats?logo=Python)](https://pypi.org/project/obscure_stats/) [![PyPI](https://img.shields.io/pypi/v/obscure_stats?logo=PyPI)](https://pypi.org/project/obscure_stats) | -| Meta | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/) [![License - MIT](https://img.shields.io/badge/license-MIT-9400d3.svg)](https://spdx.org/licenses/) [![DOI](https://zenodo.org/badge/163630824.svg)](https://zenodo.org/badge/latestdoi/163630824) +| CI/CD |[![CI - Test](https://github.com/glevv/obscure_stats/actions/workflows/package.yml/badge.svg)](https://github.com/glevv/obscure_stats/actions/workflows/package.yml) [![Coverage](https://codecov.io/github/glevv/obscure_stats/coverage.svg?branch=main)](https://codecov.io/gh/glevv/obscure_stats) +| Package | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/obscure_stats?logo=Python)](https://pypi.org/project/obscure_stats/) [![PyPI](https://img.shields.io/pypi/v/obscure_stats?logo=PyPI)](https://pypi.org/project/obscure_stats/) | +| Meta | [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) 
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/) [![License - MIT](https://img.shields.io/badge/license-MIT-9400d3.svg)](https://spdx.org/licenses/) [![DOI](https://zenodo.org/badge/163630824.svg)](https://zenodo.org/badge/latestdoi/163630824) -**Collection of less-known statistical measures** +## Highlights: -Highlights: +`obscure_stats` is a Python package that includes a lot of useful but lesser-known statistical functions and is built on top of `numpy` and `scipy`. -- Collection of measures of central tendency - `obscure_stats/central_tendency`. -- Collection of measures of dispersion - `obscure_stats/dispersion`. -- Collection of measures of skewness - `obscure_stats/skewness`. -- Collection of measures of kurtosis - `obscure_stats/kurtosis`. -- Collection of measures of association - `obscure_stats/association`. -- Collection of measures of qualitative variation - `obscure_stats/variation`. +## Current API list -**Installation** +- Collection of measures of central tendency - `obscure_stats/central_tendency`: + * Contraharmonic Mean; + * Half-Sample Mode; + * Hodges-Lehmann-Sen Location; + * Midhinge; + * Midmean; + * Midrange; + * Standard Trimmed Harrell-Davis Quantile; + * Trimean. +- Collection of measures of dispersion - `obscure_stats/dispersion`: + * Coefficient of Variation; + * Dispersion Ratio; + * Linear Coefficient of Variation; + * Lloyds Index; + * Morisita Index; + * Quartile Coefficient of Dispersion; + * Robust Coefficient of Variation; + * Standard Quantile Absolute Deviation; + * Studentized Range. 
+- Collection of measures of skewness - `obscure_stats/skewness`: + * Area Under the Skewness Curve (weighted and unweighted); + * Bickel Mode Skewness Coefficient; + * Bowley Skewness Coefficient; + * Forhad-Shorna Rank Skewness Coefficient; + * Groeneveld Skewness Coefficient; + * Hossain-Adnan Skewness Coefficient; + * Kelly Skewness Coefficient; + * Medeen Skewness Coefficient; + * Pearson Median Skewness Coefficient; + * Pearson Mode Skewness Coefficient (original and halfmode modification). +- Collection of measures of kurtosis - `obscure_stats/kurtosis`: + * Crow-Siddiqui Kurtosis; + * Hogg Kurtosis; + * Moors Kurtosis; + * Moors Octile Kurtosis; + * Reza-Ma Kurtosis. +- Collection of measures of association - `obscure_stats/association`: + * Chatterjee Xi correlation Coefficient (original and symmetric versions); + * Concordance Correlation Coefficient; + * Concordance Rate; + * Tanimoto Similarity; + * Zhang I Correlation Coefficient. +- Collection of measures of qualitative variation - `obscure_stats/variation`: + * AVDev; + * B Index; + * Extropy; + * Gibbs M1; + * Gibbs M2; + * ModVR; + * RanVR. + +## Installation `pip install obscure_stats` -**License** +## License The content of this repository is licensed under a [MIT license](https://github.com/glevv/obscure_stats/blob/main/LICENSE). diff --git a/src/obscure_stats/association/association.py b/src/obscure_stats/association/association.py index 9ed0b4c..38ccb89 100644 --- a/src/obscure_stats/association/association.py +++ b/src/obscure_stats/association/association.py @@ -154,9 +154,9 @@ def concordance_rate( It could be seen as simplified version of Pearson's R. It differs from quadrant count ratio by adding and exclusion zone - variation has an option for an exclusion zone. - It is based on the standard error of the mean and will exlucde - points that are in the range of mean+-sem. + variation has an option for an exclusion zone. 
It is based on the + standard error of the mean and will exclude points that are in the + range of mean+-sem. Parameters ---------- diff --git a/src/obscure_stats/central_tendency/central_tendency.py b/src/obscure_stats/central_tendency/central_tendency.py index a26e21a..a2983bf 100644 --- a/src/obscure_stats/central_tendency/central_tendency.py +++ b/src/obscure_stats/central_tendency/central_tendency.py @@ -175,8 +175,8 @@ def standard_trimmed_harrell_davis_quantile(x: np.ndarray, q: float = 0.5) -> fl """Calculate Standard Trimmed Harrell-Davis median estimator. This measure is very robust. - It uses modified Harrel-Davies quantiles to calculate median - in the most dense region of probability function. + It calculates weighted Harrell-Davis quantiles on only sqrt(N) samples, + located in the region with the most probability mass. Parameters ---------- diff --git a/src/obscure_stats/dispersion/__init__.py b/src/obscure_stats/dispersion/__init__.py index d648e0e..62b0710 100644 --- a/src/obscure_stats/dispersion/__init__.py +++ b/src/obscure_stats/dispersion/__init__.py @@ -4,7 +4,6 @@ coefficient_of_lvariation, coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -17,7 +16,6 @@ "coefficient_of_lvariation", "coefficient_of_variation", "dispersion_ratio", - "efficiency", "lloyds_index", "morisita_index", "quartile_coefficient_of_dispersion", diff --git a/src/obscure_stats/dispersion/dispersion.py b/src/obscure_stats/dispersion/dispersion.py index 1ca430a..d21bbc8 100644 --- a/src/obscure_stats/dispersion/dispersion.py +++ b/src/obscure_stats/dispersion/dispersion.py @@ -8,31 +8,6 @@ EPS = 1e-6 -def efficiency(x: np.ndarray) -> float: - """Calculate array efficiency (squared CV). - - Parameters - ---------- - x : array_like - Input array. - - Returns - ------- - eff : float or array_like. - The value of the efficiency. - - References - ---------- - Grubbs, F. E. (1965). 
- Statistical Measures of Accuracy for Riflemen and Missile Engineers. pp. 26-27. - """ - mean = np.nanmean(x) - if abs(mean) <= EPS: - warnings.warn("Mean is close to 0. Statistic is undefined.", stacklevel=2) - return np.inf - return np.nanvar(x) / mean**2 - - def studentized_range(x: np.ndarray) -> float: """Calculate range normalized by standard deviation. @@ -59,7 +34,10 @@ def studentized_range(x: np.ndarray) -> float: def coefficient_of_lvariation(x: np.ndarray) -> float: - """Calculate linear coefficient of variation (MeanAbsDev / Mean). + """Calculate linear coefficient of variation. + + L-CV is the L-scale (half of mean absolute deviation) divided + by L-mean (the same as regular mean). Parameters ---------- @@ -87,7 +65,7 @@ def coefficient_of_lvariation(x: np.ndarray) -> float: def coefficient_of_variation(x: np.ndarray) -> float: - """Calculate coefficient of variation (Std / Mean). + """Calculate coefficient of variation (Standard deviation / Mean). Parameters ---------- @@ -115,7 +93,8 @@ def coefficient_of_variation(x: np.ndarray) -> float: def robust_coefficient_of_variation(x: np.ndarray) -> float: """Calculate robust coefficient of variation. - It is based on median absolute deviation from the median (MedAbsDev / Median). + It is based on median absolute deviation from the median, i.e. median + absolute deviation from the median divided by the median. Parameters ---------- @@ -170,6 +149,11 @@ def quartile_coefficient_of_dispersion(x: np.ndarray) -> float: def dispersion_ratio(x: np.ndarray) -> float: """Calculate dispersion ratio (Mean / GMean). + The closer a dispersion ratio is to 1, the lower the dispersion + between the observations within an array. + In this function the geometric mean is computed by excluding zeros and + missing data points. + Parameters ---------- x : array_like @@ -187,7 +171,9 @@ def dispersion_ratio(x: np.ndarray) -> float: prior to unsupervised machine learning. 
Statistics, Optimization & Information Computing, 11(2), 519-530. """ - return np.nanmean(x) / (stats.gmean(x, nan_policy="omit") + EPS) + _x = np.asarray(x) + _x = np.where(_x == 0, np.nan, _x) + return np.nanmean(x) / stats.gmean(_x, nan_policy="omit") def lloyds_index(x: np.ndarray) -> float: @@ -246,8 +232,8 @@ def morisita_index(x: np.ndarray) -> float: def sqad(x: np.ndarray) -> float: """Calculate Standard quantile absolute deviation. - This measure is a robust measure of dispersion, that does not need - normalizing constant like MAD and has higher gaussian efficiency. + This measure is a robust measure of dispersion that has higher + Gaussian efficiency than MAD, but a lower breakdown point. Parameters ---------- @@ -266,5 +252,8 @@ def sqad(x: np.ndarray) -> float: arXiv preprint arXiv:2208.13459. """ med = np.nanmedian(x) - # constant value to maximize efficiency for normal distribution - return np.nanquantile(np.abs(x - med), q=0.682689492137086) + n = len(x) + # finite sample correction + k = 1.0 + 0.762 / n + 0.967 / n**2 + # constant value that maximizes efficiency for normal distribution + return k * np.nanquantile(np.abs(x - med), q=0.682689492137086) diff --git a/src/obscure_stats/kurtosis/kurtosis.py b/src/obscure_stats/kurtosis/kurtosis.py index 045c565..3e5d423 100644 --- a/src/obscure_stats/kurtosis/kurtosis.py +++ b/src/obscure_stats/kurtosis/kurtosis.py @@ -8,6 +8,11 @@ def moors_kurt(x: np.ndarray) -> float: """Calculate Moor's vision of kurtosis, based on Z score. + The kurtosis can be seen as a measure of the dispersion of + squared Z around its expectation. + Alternatively it can be seen to be a measure of the dispersion + of Z around +1 and -1. 
+ Parameters ---------- x : array_like diff --git a/src/obscure_stats/skewness/skewness.py b/src/obscure_stats/skewness/skewness.py index a6c1000..6e56e6a 100644 --- a/src/obscure_stats/skewness/skewness.py +++ b/src/obscure_stats/skewness/skewness.py @@ -138,7 +138,9 @@ def medeen_skew(x: np.ndarray) -> float: def bowley_skew(x: np.ndarray) -> float: """Calculate Bowley's skewness coefficinet. - It is based on quartiles (uncentered, unscaled). + Also known as Yule-Kendall skewness coefficient. + It is based on quartiles (uncentered, unscaled) and compares the distance + between the median and each of the two quartiles. This measure should be more robust than moment based skewness. Parameters diff --git a/src/obscure_stats/variation/__init__.py b/src/obscure_stats/variation/__init__.py index a043841..a518750 100644 --- a/src/obscure_stats/variation/__init__.py +++ b/src/obscure_stats/variation/__init__.py @@ -1,7 +1,7 @@ """Variation module.""" from .variation import ( - ada_index, + avdev, b_index, extropy, gibbs_m1, @@ -11,7 +11,7 @@ ) __all__ = [ - "ada_index", + "avdev", "b_index", "extropy", "gibbs_m1", diff --git a/src/obscure_stats/variation/variation.py b/src/obscure_stats/variation/variation.py index fe03ba0..81f9e03 100644 --- a/src/obscure_stats/variation/variation.py +++ b/src/obscure_stats/variation/variation.py @@ -167,12 +167,12 @@ def b_index(x: np.ndarray) -> float: return 1 - np.sqrt(1 - np.square(stats.gmean(freq * len(freq) / n))) -def ada_index(x: np.ndarray) -> float: +def avdev(x: np.ndarray) -> float: """Calculate Average Deviation Analogue. Normalized to 0-1 range categorical analog of the mean deviation. - Low values of Ada Index correspond to lower variation and + Low values of AVDev correspond to lower variation and high values to higher variation. Parameters @@ -182,8 +182,8 @@ def ada_index(x: np.ndarray) -> float: Returns ------- - adi : float - The value of ADA index. + avd : float + The value of AVDev. 
References ---------- diff --git a/tests/test_dispersion.py b/tests/test_dispersion.py index 497d019..21a763a 100644 --- a/tests/test_dispersion.py +++ b/tests/test_dispersion.py @@ -8,7 +8,6 @@ coefficient_of_lvariation, coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -25,7 +24,6 @@ coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -54,7 +52,6 @@ def test_mock_aggregation_functions( coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -79,7 +76,6 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None: coefficient_of_variation, robust_coefficient_of_variation, quartile_coefficient_of_dispersion, - efficiency, ], ) def test_cv_corner_cases(func: typing.Callable) -> None: @@ -98,7 +94,6 @@ def test_cv_corner_cases(func: typing.Callable) -> None: coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, diff --git a/tests/test_variation.py b/tests/test_variation.py index d6ee606..8259f0d 100644 --- a/tests/test_variation.py +++ b/tests/test_variation.py @@ -5,7 +5,7 @@ import numpy as np import pytest from obscure_stats.variation import ( - ada_index, + avdev, b_index, extropy, gibbs_m1, @@ -18,7 +18,7 @@ @pytest.mark.parametrize( "func", [ - ada_index, + avdev, b_index, extropy, gibbs_m1, @@ -41,7 +41,7 @@ def test_mock_variation_functions( @pytest.mark.parametrize( "func", [ - ada_index, + avdev, b_index, gibbs_m1, gibbs_m2, @@ -66,7 +66,7 @@ def test_var_sensibility_higher_better(func: typing.Callable, seed: int) -> None @pytest.mark.parametrize( "func", [ - ada_index, + avdev, b_index, extropy, gibbs_m1,