diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml new file mode 100644 index 0000000..349bbaf --- /dev/null +++ b/.github/workflows/package.yml @@ -0,0 +1,69 @@ +name: CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: True + matrix: + python-version: [ "3.11" ] + + steps: + - name: Check out repository + uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Activate env + # Each step starts a fresh shell, so put the venv on PATH for later steps + run: | + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" + + - uses: chartboost/ruff-action@v1 + name: Ruff + with: + args: --check . + config: pyproject.toml + + - name: Run mypy + run: | + mypy . + + - name: Run tests + run: | + pytest --cov tests/ + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4-beta + with: + flags: smart-tests + verbose: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file diff --git a/src/obscure_stats/association/association.py b/src/obscure_stats/association/association.py index 9ed0b4c..38ccb89 100644 --- a/src/obscure_stats/association/association.py +++ b/src/obscure_stats/association/association.py @@ -154,9 +154,9 @@ def concordance_rate( It could be seen as simplified version of Pearson's R. It differs from quadrant count ratio by adding and exclusion zone - variation has an option for an exclusion zone. 
- It is based on the standard error of the mean and will exlucde - points that are in the range of mean+-sem. + variation has an option for an exclusion zone. It is based on the + standard error of the mean and will exclude points that are in the + range of mean+-sem. Parameters ---------- diff --git a/src/obscure_stats/dispersion/__init__.py b/src/obscure_stats/dispersion/__init__.py index d648e0e..62b0710 100644 --- a/src/obscure_stats/dispersion/__init__.py +++ b/src/obscure_stats/dispersion/__init__.py @@ -4,7 +4,6 @@ coefficient_of_lvariation, coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -17,7 +16,6 @@ "coefficient_of_lvariation", "coefficient_of_variation", "dispersion_ratio", - "efficiency", "lloyds_index", "morisita_index", "quartile_coefficient_of_dispersion", diff --git a/src/obscure_stats/dispersion/dispersion.py b/src/obscure_stats/dispersion/dispersion.py index 1ca430a..d21bbc8 100644 --- a/src/obscure_stats/dispersion/dispersion.py +++ b/src/obscure_stats/dispersion/dispersion.py @@ -8,31 +8,6 @@ EPS = 1e-6 -def efficiency(x: np.ndarray) -> float: - """Calculate array efficiency (squared CV). - - Parameters - ---------- - x : array_like - Input array. - - Returns - ------- - eff : float or array_like. - The value of the efficiency. - - References - ---------- - Grubbs, F. E. (1965). - Statistical Measures of Accuracy for Riflemen and Missile Engineers. pp. 26-27. - """ - mean = np.nanmean(x) - if abs(mean) <= EPS: - warnings.warn("Mean is close to 0. Statistic is undefined.", stacklevel=2) - return np.inf - return np.nanvar(x) / mean**2 - - def studentized_range(x: np.ndarray) -> float: """Calculate range normalized by standard deviation. @@ -59,7 +34,10 @@ def studentized_range(x: np.ndarray) -> float: def coefficient_of_lvariation(x: np.ndarray) -> float: - """Calculate linear coefficient of variation (MeanAbsDev / Mean). + """Calculate linear coefficient of variation. 
+ + L-CV is the L-scale (half of mean absolute deviation) divided + by L-mean (the same as regular mean). Parameters ---------- @@ -87,7 +65,7 @@ def coefficient_of_lvariation(x: np.ndarray) -> float: def coefficient_of_variation(x: np.ndarray) -> float: - """Calculate coefficient of variation (Std / Mean). + """Calculate coefficient of variation (Standard deviation / Mean). Parameters ---------- @@ -115,7 +93,8 @@ def coefficient_of_variation(x: np.ndarray) -> float: def robust_coefficient_of_variation(x: np.ndarray) -> float: """Calculate robust coefficient of variation. - It is based on median absolute deviation from the median (MedAbsDev / Median). + It is based on the median absolute deviation from the median (MAD), + i.e. the MAD divided by the median. Parameters ---------- @@ -170,6 +149,11 @@ def quartile_coefficient_of_dispersion(x: np.ndarray) -> float: def dispersion_ratio(x: np.ndarray) -> float: """Calculate dispersion ratio (Mean / GMean). + The closer a dispersion ratio is to 1, the lower the dispersion + between the observations within an array. + In this function the geometric mean is computed by excluding zeros + and missing data points. + Parameters ---------- x : array_like @@ -187,7 +171,9 @@ def dispersion_ratio(x: np.ndarray) -> float: prior to unsupervised machine learning. Statistics, Optimization & Information Computing, 11(2), 519-530. """ - return np.nanmean(x) / (stats.gmean(x, nan_policy="omit") + EPS) + _x = np.asarray(x) + _x = np.where(_x == 0, np.nan, _x) + return np.nanmean(x) / stats.gmean(_x, nan_policy="omit") def lloyds_index(x: np.ndarray) -> float: @@ -246,8 +232,8 @@ def morisita_index(x: np.ndarray) -> float: def sqad(x: np.ndarray) -> float: """Calculate Standard quantile absolute deviation. - This measure is a robust measure of dispersion, that does not need - normalizing constant like MAD and has higher gaussian efficiency. 
+ This measure is a robust measure of dispersion, that has higher + gaussian efficiency, but lower breakdown point than MAD. Parameters ---------- @@ -266,5 +252,8 @@ def sqad(x: np.ndarray) -> float: arXiv preprint arXiv:2208.13459. """ med = np.nanmedian(x) - # constant value to maximize efficiency for normal distribution - return np.nanquantile(np.abs(x - med), q=0.682689492137086) + # finite sample correction uses only the observations nanquantile keeps + n = np.count_nonzero(~np.isnan(x)) + k = 1.0 + 0.762 / n + 0.967 / n**2 + # constant value that maximizes efficiency for normal distribution + return k * np.nanquantile(np.abs(x - med), q=0.682689492137086) diff --git a/src/obscure_stats/kurtosis/kurtosis.py b/src/obscure_stats/kurtosis/kurtosis.py index 045c565..3e5d423 100644 --- a/src/obscure_stats/kurtosis/kurtosis.py +++ b/src/obscure_stats/kurtosis/kurtosis.py @@ -8,6 +8,11 @@ def moors_kurt(x: np.ndarray) -> float: """Calculate Moor's vision of kurtosis, based on Z score. + The kurtosis can be seen as a measure of the dispersion of + squared Z around its expectation. + Alternatively it can be seen to be a measure of the dispersion + of Z around +1 and -1. + Parameters ---------- x : array_like diff --git a/src/obscure_stats/skewness/skewness.py b/src/obscure_stats/skewness/skewness.py index a6c1000..6e56e6a 100644 --- a/src/obscure_stats/skewness/skewness.py +++ b/src/obscure_stats/skewness/skewness.py @@ -138,7 +138,9 @@ def medeen_skew(x: np.ndarray) -> float: def bowley_skew(x: np.ndarray) -> float: """Calculate Bowley's skewness coefficinet. - It is based on quartiles (uncentered, unscaled). + Also known as Yule-Kendall skewness coefficient. + It is based on quartiles (uncentered, unscaled) and compares the distance + between the median and each of the two quartiles. This measure should be more robust than moment based skewness. 
Parameters diff --git a/tests/test_dispersion.py b/tests/test_dispersion.py index 497d019..21a763a 100644 --- a/tests/test_dispersion.py +++ b/tests/test_dispersion.py @@ -8,7 +8,6 @@ coefficient_of_lvariation, coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -25,7 +24,6 @@ coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -54,7 +52,6 @@ def test_mock_aggregation_functions( coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion, @@ -79,7 +76,6 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None: coefficient_of_variation, robust_coefficient_of_variation, quartile_coefficient_of_dispersion, - efficiency, ], ) def test_cv_corner_cases(func: typing.Callable) -> None: @@ -98,7 +94,6 @@ def test_cv_corner_cases(func: typing.Callable) -> None: coefficient_of_variation, robust_coefficient_of_variation, dispersion_ratio, - efficiency, lloyds_index, morisita_index, quartile_coefficient_of_dispersion,