Skip to content

Commit

Permalink
Calc kernel once (#83)
Browse files Browse the repository at this point in the history
* remove support for python 3.5

* remove kendall, pearson, spearman

* calculate kernel once

* flake 8 formatting fixes

* add systsem and package requirements to readme

* make indep test not case sensitive

* add workers option to kernels

* will change scipy code to permute distance matrices correctly

* fix scipy issue later

* assertion errors fix

* Update mgcx.py
  • Loading branch information
sampan501 authored Apr 28, 2020
1 parent ddcaa5e commit 4547cc8
Show file tree
Hide file tree
Showing 43 changed files with 259 additions and 697 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
language: python

python:
- '3.5'
- '3.6'
- '3.7'
- '3.8'

jobs:
include:
Expand All @@ -24,11 +24,11 @@ cache: pip
install:
- pip install -r requirements.txt
- pip install -U pytest pytest-cov codecov
- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then travis_retry pip install black; fi
- pip install black

script:
- pytest hyppo/
- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then black --check --diff ./hyppo; fi
- black --check --diff ./hyppo

after_success:
- codecov
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,9 @@
- **Issues:** [https://github.com/neurodata/hyppo/issues](https://github.com/neurodata/hyppo/issues)
- **Contribution Guide:** [https://hyppo.neurodata.io/contributing.html](https://hyppo.neurodata.io/contributing.html)

Some system/package requirements:
- **Python**: 3.6+
- **OS**: All major platforms (Linux, macOS, Windows)
- **Dependencies**: numpy, scipy, numba, scikit-learn

`hyppo` intends to be a comprehensive multivariate hypothesis testing package and runs on all major versions of operating systems. It also includes novel tests not found in other packages. It is quick to install and free of charge. If you need to use multivariate hypothesis testing, be sure to give `hyppo` a try!
4 changes: 0 additions & 4 deletions benchmarks/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ def __call__(self, index):
# calculate permuted stats, store in null distribution
perm_stat = self.test._statistic(x, permy)

if self.test_name == "Pearson":
obs_stat = np.abs(obs_stat)
perm_stat = np.abs(perm_stat)

return obs_stat, perm_stat


Expand Down
12 changes: 0 additions & 12 deletions docs/reference/independence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,3 @@ Cannonical Correlation Analysis (CCA)
RV
---
.. autoclass:: RV

Pearson
-------
.. autoclass:: Pearson

Kendall's tau
-------------
.. autoclass:: Kendall

Spearman's rho
--------------
.. autoclass:: Spearman
18 changes: 7 additions & 11 deletions docs/tutorials/independence/indep_power.ipynb

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions docs/tutorials/independence/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ def __call__(self, index):
# calculate permuted stats, store in null distribution
perm_stat = self.test._statistic(x, permy)

if self.test_name == "Pearson":
obs_stat = np.abs(obs_stat)
perm_stat = np.abs(perm_stat)

return obs_stat, perm_stat


Expand Down
40 changes: 26 additions & 14 deletions hyppo/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from joblib import Parallel, delayed

import numpy as np
from scipy._lib._util import check_random_state
from scipy.stats.distributions import chi2
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel


# from scipy
Expand Down Expand Up @@ -95,29 +95,37 @@ def check_inputs_distmat(inputs):
)


def euclidean(x):
def euclidean(x, workers=None):
"""Default euclidean distance function calculation"""
return cdist(x, x, metric="euclidean")
return pairwise_distances(X=x, metric="euclidean", n_jobs=workers)


def gaussian(x):
def gaussian(x, workers=None):
"""Default medial gaussian kernel similarity calculation"""
l1 = cdist(x, x, "cityblock")
mask = np.ones(l1.shape, dtype=bool)
np.fill_diagonal(mask, 0)
gamma = 1.0 / (2 * (np.median(l1[mask]) ** 2))
return np.exp(-gamma * cdist(x, x, "sqeuclidean"))
l1 = pairwise_distances(X=x, metric="l1", n_jobs=workers)
n = l1.shape[0]
med = np.median(
np.lib.stride_tricks.as_strided(
l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
)[:, 1:]
)
gamma = 1.0 / (2 * (med ** 2))
return rbf_kernel(x, gamma=gamma)


# p-value computation
def _perm_stat(calc_stat, x, y):
permy = np.random.permutation(y)
def _perm_stat(calc_stat, x, y, is_distsim=True):
if is_distsim:
order = np.random.permutation(y.shape[0])
permy = y[order][:, order]
else:
permy = np.random.permutation(y)
perm_stat = calc_stat(x, permy)

return perm_stat


def perm_test(calc_stat, x, y, reps=1000, workers=1):
def perm_test(calc_stat, x, y, reps=1000, workers=1, is_distsim=True):
"""
Calculate the p-value via permutation
"""
Expand All @@ -127,12 +135,16 @@ def perm_test(calc_stat, x, y, reps=1000, workers=1):
# calculate null distribution
null_dist = np.array(
Parallel(n_jobs=workers)(
[delayed(_perm_stat)(calc_stat, x, y) for rep in range(reps)]
[delayed(_perm_stat)(calc_stat, x, y, is_distsim) for rep in range(reps)]
)
)
pvalue = (null_dist >= stat).sum() / reps

# correct for a p-value of 0. This is because, with bootstrapping
# permutations, a p-value of 0 is incorrect
if pvalue == 0:
pvalue = 1 / reps

return stat, pvalue, null_dist


Expand Down
3 changes: 1 addition & 2 deletions hyppo/discrim/_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import warnings
import numpy as np
from .._utils import (
contains_nan,
Expand Down Expand Up @@ -42,7 +41,7 @@ def __call__(self):
return self.x, self.y

def _condition_input(self, x1):
"""Checks whether there is only one subject and removes
"""Checks whether there is only one subject and removes
isolates and calculate distance."""
uniques, counts = np.unique(self.y, return_counts=True)

Expand Down
1 change: 0 additions & 1 deletion hyppo/discrim/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from abc import ABC, abstractmethod
import numpy as np
from .._utils import euclidean


class DiscriminabilityTest(ABC):
Expand Down
1 change: 0 additions & 1 deletion hyppo/discrim/discrim_one_samp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from ._utils import _CheckInputs
import numpy as np
import random
from .base import DiscriminabilityTest
from scipy._lib._util import MapWrapper

Expand Down
14 changes: 7 additions & 7 deletions hyppo/discrim/discrim_two_samp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from ._utils import _CheckInputs
import numpy as np
import random
from numba import njit
from .base import DiscriminabilityTest
from scipy._lib._util import MapWrapper
Expand All @@ -27,7 +26,8 @@ class DiscrimTwoSample(DiscriminabilityTest):
Notes
-----
Let :math:`\hat D_{x_1}` denote the sample discriminability of one approach,
and :math:`\hat D_{x_2}` denote the sample discriminability of another approach. Then,
and :math:`\hat D_{x_2}` denote the sample discriminability of another approach.
Then,
.. math::
Expand Down Expand Up @@ -81,15 +81,15 @@ def test(self, x1, x2, y, reps=1000, alt="neq", workers=-1):
to ``True`` in this case.
y : ndarray
A vector containing the sample ids for our `n` samples. Should be matched
to the inputs such that ``y[i]`` is the corresponding label for ``x_1[i, :]``
and ``x_2[i, :]``.
to the inputs such that ``y[i]`` is the corresponding label for
``x_1[i, :]`` and ``x_2[i, :]``.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
when using the permutation test used to calculate the p-value.
alt : {"greater", "less", "neq"} (default: "neq")
The alternative hypothesis for the test. Can test that first dataset is more
discriminable (alt = "greater"), less discriminable (alt = "less") or unequal
discriminability (alt = "neq").
The alternative hypothesis for the test. Can test that first dataset is
more discriminable (alt = "greater"), less discriminable (alt = "less")
or unequal discriminability (alt = "neq").
workers : int, optional (default: -1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the Process.
Expand Down
5 changes: 3 additions & 2 deletions hyppo/discrim/tests/test_discrim_two_samp.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def test_less(self):
assert_almost_equal(p, obs_p, decimal=4)

def test_neq(self):
# test whether discriminability for x1 is not equal compared to discriminability for x2
# test whether discriminability for x1 is not equal compared to
# discriminability for x2
x1 = np.ones((100, 2), dtype=float)
x2 = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
y = np.concatenate((np.zeros(50), np.ones(50)), axis=0)
Expand All @@ -55,7 +56,7 @@ def test_neq(self):


class TestTwoSampleWarn:
"""
"""
Tests errors and warnings derived from one sample test.
"""

Expand Down
3 changes: 0 additions & 3 deletions hyppo/independence/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
from .pearson import Pearson
from .rv import RV
from .cca import CCA
from .kendall import Kendall
from .spearman import Spearman
from .hhg import HHG
from .dcorr import Dcorr
from .hsic import Hsic
Expand Down
59 changes: 18 additions & 41 deletions hyppo/independence/_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import warnings

import numpy as np
from scipy.stats import chi2

from .._utils import (
contains_nan,
Expand All @@ -15,10 +12,9 @@
class _CheckInputs:
"""Checks inputs for all independence tests"""

def __init__(self, x, y, dim, reps=None, compute_distance=None):
def __init__(self, x, y, reps=None, compute_distance=None):
self.x = x
self.y = y
self.dim = dim
self.compute_distance = compute_distance
self.reps = reps

Expand All @@ -29,6 +25,7 @@ def __call__(self):
self.x, self.y = self.check_dim_xy()
self.x, self.y = convert_xy_float64(self.x, self.y)
self._check_min_samples()
self._check_variance()
check_compute_distance(self.compute_distance)

if self.reps:
Expand All @@ -38,30 +35,20 @@ def __call__(self):

def check_dim_xy(self):
"""Convert x and y to proper dimensions"""
# for kendall, pearson, and spearman
if self.dim == 1:
# check if x or y is shape (n,)
if self.x.ndim > 1 or self.y.ndim > 1:
self.x.shape = (-1,)
self.y.shape = (-1,)

# for other tests
elif self.dim > 1:
# convert arrays of type (n,) to (n, 1)
if self.x.ndim == 1:
self.x = self.x[:, np.newaxis]
elif self.x.ndim != 2:
raise ValueError(
"Expected a 2-D array `x`, found shape " "{}".format(self.x.shape)
)
if self.y.ndim == 1:
self.y = self.y[:, np.newaxis]
elif self.y.ndim != 2:
raise ValueError(
"Expected a 2-D array `y`, found shape " "{}".format(self.y.shape)
)
if self.x.ndim == 1:
self.x = self.x[:, np.newaxis]
elif self.x.ndim != 2:
raise ValueError(
"Expected a 2-D array `x`, found shape " "{}".format(self.x.shape)
)
if self.y.ndim == 1:
self.y = self.y[:, np.newaxis]
elif self.y.ndim != 2:
raise ValueError(
"Expected a 2-D array `y`, found shape " "{}".format(self.y.shape)
)

self._check_nd_indeptest()
self._check_nd_indeptest()

return self.x, self.y

Expand All @@ -82,16 +69,6 @@ def _check_min_samples(self):
if nx <= 3 or ny <= 3:
raise ValueError("Number of samples is too low")


def _chi2_approx(stat, null_dist, samps):
mu = np.mean(null_dist)
sigma = np.std(null_dist)

if sigma < 10e-4 and mu < 10e-4:
x = 0.0
else:
x = samps * (stat - mu) / sigma + 1

pvalue = 1 - chi2.cdf(x, 1)

return pvalue
def _check_variance(self):
if np.var(self.x) == 0 or np.var(self.y) == 0:
raise ValueError("Test cannot be run, one of the inputs has 0 variance")
6 changes: 2 additions & 4 deletions hyppo/independence/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from abc import ABC, abstractmethod

import numpy as np

from .._utils import euclidean, perm_test


Expand Down Expand Up @@ -40,7 +38,7 @@ def _statistic(self, x, y):
"""

@abstractmethod
def test(self, x, y, reps=1000, workers=1):
def test(self, x, y, reps=1000, workers=1, is_distsim=True):
r"""
Calulates the independence test p-value.
Expand All @@ -66,7 +64,7 @@ def test(self, x, y, reps=1000, workers=1):

# calculate p-value
stat, pvalue, null_dist = perm_test(
self._statistic, x, y, reps=reps, workers=workers
self._statistic, x, y, reps=reps, workers=workers, is_distsim=is_distsim
)
self.stat = stat
self.pvalue = pvalue
Expand Down
5 changes: 2 additions & 3 deletions hyppo/independence/cca.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ class CCA(IndependenceTest):
See Also
--------
Pearson : Pearson product-moment correlation test statistic and p-value.
RV : RV test statistic and p-value.
Notes
Expand Down Expand Up @@ -140,8 +139,8 @@ def test(self, x, y, reps=1000, workers=1):
>>> '%.1f, %.2f' % (stat, pvalue)
'1.0, 0.00'
"""
check_input = _CheckInputs(x, y, dim=2, reps=reps)
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

# use default permutation test
return super(CCA, self).test(x, y, reps, workers)
return super(CCA, self).test(x, y, reps, workers, is_distsim=False)
Loading

0 comments on commit 4547cc8

Please sign in to comment.