From a3e2ba6a9fc36ad34c73399b26c40b4642742b42 Mon Sep 17 00:00:00 2001 From: Raphael Vallat Date: Sun, 20 Feb 2022 09:02:48 -0800 Subject: [PATCH] Release v0.5.1 (#236) * Flake8 * Explicit error when y is an empty list in pg.ttest https://github.com/raphaelvallat/pingouin/issues/222 * Add keyword arguments in homoscedasticity function https://github.com/raphaelvallat/pingouin/issues/218 * Bugfix rm_anova and mixed_anova changed the dtypes of categorical columns + added observed=True to all groupby https://github.com/raphaelvallat/pingouin/issues/224 * Update version number in init and setup * Use np.isclose for test_pearson == 1 https://github.com/raphaelvallat/pingouin/issues/195 * Coverage for try..except scipy fallback * Fix set_option for pandas 1.4 * Upgraded dependencies for seaborn and statsmodels * Added Jarque-Bera test in pg.normality https://github.com/raphaelvallat/pingouin/issues/216 * Coverage scipy import error * Use pd.concat instead of frame.append to avoid FutureWarning * Remove add_categories(inplace=True) to avoid FutureWarning * GH Discussions instead of Gitter * Minor doc fix --- README.rst | 7 +-- docs/changelog.rst | 19 ++++-- docs/index.rst | 5 +- pingouin/__init__.py | 2 +- pingouin/distribution.py | 52 +++++++++------ pingouin/pairwise.py | 27 +++----- pingouin/parametric.py | 98 +++++++++++------------------ pingouin/regression.py | 6 +- pingouin/tests/test_correlation.py | 5 +- pingouin/tests/test_distribution.py | 6 +- pingouin/tests/test_pairwise.py | 2 +- pingouin/tests/test_parametric.py | 11 ++-- pingouin/tests/test_utils.py | 6 ++ pingouin/utils.py | 1 + requirements.txt | 4 +- setup.py | 6 +- 16 files changed, 121 insertions(+), 136 deletions(-) diff --git a/README.rst b/README.rst index 1c3c2b13..75a54395 100644 --- a/README.rst +++ b/README.rst @@ -23,8 +23,6 @@ .. image:: http://joss.theoj.org/papers/d2254e6d8e8478da192148e4cfbe4244/status.svg :target: http://joss.theoj.org/papers/d2254e6d8e8478da192148e4cfbe4244 -.. image:: https://badges.gitter.im/owner/repo.png - :target: https://gitter.im/pingouin-stats/Lobby ---------------- @@ -70,10 +68,7 @@ Documentation Chat ==== -If you have questions, please ask them in the public `Gitter chat `_ - -.. image:: https://badges.gitter.im/owner/repo.png - :target: https://gitter.im/pingouin-stats/Lobby +If you have questions, please ask them in `GitHub Discussions `_. Installation ============ diff --git a/docs/changelog.rst b/docs/changelog.rst index 04a7fd47..015fe374 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,13 +8,24 @@ What's new ************* -v0.6.0.dev ----------- +v0.5.1 (February 2022) +---------------------- + +This is a minor release, with several bugfixes and improvements. This release is compatible with SciPy 1.8 and Pandas 1.4. + +**Bugfixes** + +a. Added support for SciPy 1.8 and Pandas 1.4. `PR 234 `_. +b. Fixed bug where :py:func:`pingouin.rm_anova` and :py:func:`pingouin.mixed_anova` changed the dtypes of categorical columns in-place (`issue 224 `_). **Enhancements** -a. Faster implementation of :py:func:`pingouin.gzscore`, adding all options available in zscore: axis, ddof and nan_policy. Warning: this functions is deprecated and will be removed in pingouin 0.7.0 (use scipy.stats.gzscore instead). See `pull request 210 `_. -b. Replace use of statsmodels' studentized range distribution functions with more SciPy's more accurate `scipy.stats.studentized_range`. See `pull request 229 `_. +a. 
Faster implementation of :py:func:`pingouin.gzscore`, adding all options available in zscore: axis, ddof and nan_policy. Warning: this functions is deprecated and will be removed in pingouin 0.7.0 (use :py:func:`scipy.stats.gzscore` instead). `PR 210 `_. +b. Replace use of statsmodels' studentized range distribution functions with more SciPy's more accurate :py:func:`scipy.stats.studentized_range`. `PR 229 `_. +c. Add support for optional keywords argument in the :py:func:`pingouin.homoscedasticity` function (`issue 218 `_). +d. Add support for the Jarque-Bera test in :py:func:`pingouin.normality` (`issue 216 `_). + +Lastly, we have also deprecated the Gitter forum in favor of `GitHub Discussions `_. Please use Discussions to ask questions, share ideas / tips and engage with the Pingouin community! ************* diff --git a/docs/index.rst b/docs/index.rst index 927ee5b7..663c8bdc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,9 +21,6 @@ .. image:: http://joss.theoj.org/papers/d2254e6d8e8478da192148e4cfbe4244/status.svg :target: http://joss.theoj.org/papers/d2254e6d8e8478da192148e4cfbe4244 -.. image:: https://badges.gitter.im/owner/repo.png - :target: https://gitter.im/pingouin-stats/Lobby - ---------------- @@ -108,7 +105,7 @@ Whenever a new release is out there, you can upgrade your version by typing the Quick start =========== -* If you have *questions*, please ask them in the public `Gitter chat `_. +* If you have *questions*, please ask them in `GitHub Discussions `_. * If you want to *report a bug*, please open an issue on the `GitHub repository `_. diff --git a/pingouin/__init__.py b/pingouin/__init__.py index cff5759c..bcb123b9 100644 --- a/pingouin/__init__.py +++ b/pingouin/__init__.py @@ -20,7 +20,7 @@ from .config import * # Current version -__version__ = "0.5.0" +__version__ = "0.5.1" # Warn if a newer version of Pingouin is available from outdated import warn_if_outdated diff --git a/pingouin/distribution.py b/pingouin/distribution.py index 80a5d2ea..3fead4a4 100644 --- a/pingouin/distribution.py +++ b/pingouin/distribution.py @@ -86,9 +86,11 @@ def normality(data, dv=None, group=None, method="shapiro", alpha=.05): Grouping variable (only when ``data`` is a long-format dataframe). method : str Normality test. `'shapiro'` (default) performs the Shapiro-Wilk test - using :py:func:`scipy.stats.shapiro`, and `'normaltest'` performs the - omnibus test of normality using :py:func:`scipy.stats.normaltest`. - The latter is more appropriate for large samples. + using :py:func:`scipy.stats.shapiro`, `'normaltest'` performs the + omnibus test of normality using :py:func:`scipy.stats.normaltest`, `'jarque_bera'` performs + the Jarque-Bera test using :py:func:`scipy.stats.jarque_bera`. + The Omnibus and Jarque-Bera tests are more suitable than the Shapiro test for + large samples. alpha : float Significance level. @@ -194,9 +196,16 @@ def normality(data, dv=None, group=None, method="shapiro", alpha=.05): W pval normal Pre 0.967718 0.478773 True Post 0.940728 0.095157 True + + 5. 
Same but using the Jarque-Bera test + + >>> pg.normality(data, dv='Performance', group='Time', method="jarque_bera") + W pval normal + Pre 0.304021 0.858979 True + Post 1.265656 0.531088 True """ assert isinstance(data, (pd.DataFrame, pd.Series, list, np.ndarray)) - assert method in ['shapiro', 'normaltest'] + assert method in ['shapiro', 'normaltest', 'jarque_bera'] if isinstance(data, pd.Series): data = data.to_frame() col_names = ['W', 'pval'] @@ -227,14 +236,13 @@ def normality(data, dv=None, group=None, method="shapiro", alpha=.05): grp = data.groupby(group, observed=True, sort=False) cols = grp.groups.keys() for _, tmp in grp: - stats = stats.append(normality(tmp[dv].to_numpy(), - method=method, - alpha=alpha)) + st_grp = normality(tmp[dv].to_numpy(), method=method, alpha=alpha) + stats = pd.concat([stats, st_grp], axis=0, ignore_index=True) stats.index = cols return _postprocess_dataframe(stats) -def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05): +def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05, **kwargs): """Test equality of variance. Parameters @@ -253,6 +261,8 @@ def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05): The former is more robust to departure from normality. alpha : float Significance level. + **kwargs : optional + Optional argument(s) passed to the lower-level :py:func:`scipy.stats.levene` function. Returns ------- @@ -339,7 +349,13 @@ def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05): W pval equal_var levene 1.173518 0.310707 True - 3. Bartlett test using a list of iterables + 3. Same but using a mean center + + >>> pg.homoscedasticity(data_long, dv="value", group="variable", center="mean") + W pval equal_var + levene 1.572239 0.209303 True + + 4. Bartlett test using a list of iterables >>> data = [[4, 8, 9, 20, 14], np.array([5, 8, 15, 45, 12])] >>> pg.homoscedasticity(data, method="bartlett", alpha=.05) @@ -356,30 +372,28 @@ def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05): # Get numeric data only numdata = data._get_numeric_data() assert numdata.shape[1] > 1, 'Data must have at least two columns.' - statistic, p = func(*numdata.to_numpy().T) + statistic, p = func(*numdata.to_numpy().T, **kwargs) else: # Long-format assert group in data.columns assert dv in data.columns grp = data.groupby(group, observed=True)[dv] assert grp.ngroups > 1, 'Data must have at least two columns.' - statistic, p = func(*grp.apply(list)) + statistic, p = func(*grp.apply(list), **kwargs) elif isinstance(data, list): # Check that list contains other list or np.ndarray assert all(isinstance(el, (list, np.ndarray)) for el in data) assert len(data) > 1, 'Data must have at least two iterables.' - statistic, p = func(*data) + statistic, p = func(*data, **kwargs) else: # Data is a dict assert all(isinstance(el, (list, np.ndarray)) for el in data.values()) assert len(data) > 1, 'Data must have at least two iterables.' - statistic, p = func(*data.values()) + statistic, p = func(*data.values(), **kwargs) equal_var = True if p > alpha else False stat_name = 'W' if method.lower() == 'levene' else 'T' - - stats = pd.DataFrame({stat_name: statistic, 'pval': p, - 'equal_var': equal_var}, index=[method]) + stats = pd.DataFrame({stat_name: statistic, 'pval': p, 'equal_var': equal_var}, index=[method]) return _postprocess_dataframe(stats) @@ -463,7 +477,7 @@ def _check_multilevel_rm(data, func='epsilon'): # We end up with a one-way design. 
It is similar to applying # a paired T-test to gain scores instead of using repeated measures # on two time points. Here we have computed the gain scores. - data = data.groupby(level=1, axis=1).diff(axis=1).dropna(axis=1) + data = data.groupby(level=1, axis=1, observed=True).diff(axis=1).dropna(axis=1) data = data.droplevel(level=0, axis=1) else: # Both factors have more than 2 levels -- differ from R / JASP @@ -498,8 +512,8 @@ def _long_to_wide_rm(data, dv=None, within=None, subject=None): # Keep all relevant columns and reset index data = data[_fl([subject, within, dv])] # Convert to wide-format + collapse to the mean - data = pd.pivot_table(data, index=subject, values=dv, columns=within, - aggfunc='mean', dropna=True, observed=True) + data = pd.pivot_table( + data, index=subject, values=dv, columns=within, aggfunc='mean', dropna=True, observed=True) return data diff --git a/pingouin/pairwise.py b/pingouin/pairwise.py index 8b73c4f0..c759cdc3 100644 --- a/pingouin/pairwise.py +++ b/pingouin/pairwise.py @@ -173,8 +173,8 @@ def pairwise_ttests(data=None, dv=None, between=None, within=None, subject=None, >>> import pandas as pd >>> import pingouin as pg - >>> pd.set_option('expand_frame_repr', False) - >>> pd.set_option('max_columns', 20) + >>> pd.set_option('display.expand_frame_repr', False) + >>> pd.set_option('display.max_columns', 20) >>> df = pg.read_dataset('mixed_anova.csv') >>> pg.pairwise_ttests(dv='Scores', between='Group', data=df).round(3) Contrast A B Paired Parametric T dof alternative p-unc BF10 hedges @@ -426,11 +426,12 @@ def pairwise_ttests(data=None, dv=None, between=None, within=None, subject=None, else: tmp = data # Recursive call to pairwise_ttests - stats = stats.append(pairwise_ttests( + pt = pairwise_ttests( dv=dv, between=fbt[i], within=fwt[i], subject=subject, data=tmp, parametric=parametric, marginal=marginal, alpha=alpha, alternative=alternative, padjust=padjust, effsize=effsize, correction=correction, nan_policy=nan_policy, - return_desc=return_desc), ignore_index=True, sort=False) + return_desc=return_desc) + stats = pd.concat([stats, pt], axis=0, ignore_index=True, sort=False) # Then compute the interaction between the factors if interaction: @@ -608,12 +609,6 @@ def pairwise_tukey(data=None, dv=None, between=None, effsize='hedges'): :math:`Q(\\sqrt2|t_i|, r, N - r)` where :math:`r` is the total number of groups and :math:`N` is the total sample size. - .. warning:: Versions of Pingouin below 0.3.10 used a wrong algorithm for - the studentized range approximation [2]_, which resulted in (slightly) - incorrect p-values. Please make sure you're using the - LATEST VERSION of Pingouin, and always DOUBLE CHECK your results with - another statistical software. - References ---------- .. [1] Tukey, John W. "Comparing individual means in the analysis of @@ -635,7 +630,6 @@ def pairwise_tukey(data=None, dv=None, between=None, effsize='hedges'): 1 Adelie Gentoo 3700.662 5076.016 -1375.354 56.148 -24.495 0.000 -2.967 2 Chinstrap Gentoo 3733.088 5076.016 -1342.928 69.857 -19.224 0.000 -2.894 """ - # First compute the ANOVA # For max precision, make sure rounding is disabled old_options = options.copy() @@ -760,12 +754,6 @@ def pairwise_gameshowell(data=None, dv=None, between=None, effsize='hedges'): The p-values are then approximated using the Studentized range distribution :math:`Q(\\sqrt2|t_i|, r, v_i)`. - .. 
warning:: Versions of Pingouin below 0.3.10 used a wrong algorithm for - the studentized range approximation [2]_, which resulted in (slightly) - incorrect p-values. Please make sure you're using the - LATEST VERSION of Pingouin, and always DOUBLE CHECK your results with - another statistical software. - References ---------- .. [1] Games, Paul A., and John F. Howell. "Pairwise multiple comparison @@ -789,7 +777,6 @@ def pairwise_gameshowell(data=None, dv=None, between=None, effsize='hedges'): 1 Adelie Gentoo 3700.662 5076.016 -1375.354 58.811 -23.386 249.643 0.00 -2.833 2 Chinstrap Gentoo 3733.088 5076.016 -1342.928 65.103 -20.628 170.404 0.00 -3.105 """ - # Check the dataframe _check_dataframe(dv=dv, between=between, effects='between', data=data) @@ -956,8 +943,8 @@ def pairwise_corr(data, columns=None, covar=None, alternative='two-sided', >>> import pandas as pd >>> import pingouin as pg - >>> pd.set_option('expand_frame_repr', False) - >>> pd.set_option('max_columns', 20) + >>> pd.set_option('display.expand_frame_repr', False) + >>> pd.set_option('display.max_columns', 20) >>> data = pg.read_dataset('pairwise_corr').iloc[:, 1:] >>> pg.pairwise_corr(data, method='spearman', alternative='greater', padjust='bonf').round(3) X Y method alternative n r CI95% p-unc p-corr p-adjust power diff --git a/pingouin/parametric.py b/pingouin/parametric.py index 8a53960b..515c699f 100644 --- a/pingouin/parametric.py +++ b/pingouin/parametric.py @@ -10,8 +10,7 @@ __all__ = ["ttest", "rm_anova", "anova", "welch_anova", "mixed_anova", "ancova"] -def ttest(x, y, paired=False, alternative='two-sided', correction='auto', r=.707, - confidence=0.95): +def ttest(x, y, paired=False, alternative='two-sided', correction='auto', r=.707, confidence=0.95): """T-test. Parameters @@ -198,13 +197,12 @@ def ttest(x, y, paired=False, alternative='two-sided', correction='auto', r=.707 array([1.971859, 0.057056]) """ from scipy.stats import t, ttest_rel, ttest_ind, ttest_1samp - try: - from scipy.stats._stats_py import (_unequal_var_ttest_denom, - _equal_var_ttest_denom) - except ImportError: # Fallback for scipy<1.8.0 - from scipy.stats.stats import (_unequal_var_ttest_denom, - _equal_var_ttest_denom) - from pingouin import (power_ttest, power_ttest2n, compute_effsize) + try: # pragma: no cover + from scipy.stats._stats_py import _unequal_var_ttest_denom, _equal_var_ttest_denom + except ImportError: # pragma: no cover + # Fallback for scipy<1.8.0 + from scipy.stats.stats import _unequal_var_ttest_denom, _equal_var_ttest_denom + from pingouin import power_ttest, power_ttest2n, compute_effsize # Check arguments assert alternative in ['two-sided', 'greater', 'less'], ( @@ -215,8 +213,7 @@ def ttest(x, y, paired=False, alternative='two-sided', correction='auto', r=.707 y = np.asarray(y) if x.size != y.size and paired: - warnings.warn("x and y have unequal sizes. Switching to " - "paired == False. Check your data.") + warnings.warn("x and y have unequal sizes. Switching to paired == False. Check your data.") paired = False # Remove rows with missing values @@ -512,13 +509,6 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto', # Check dataframe _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects='within') - # Convert Categorical columns to string - # This is important otherwise all the groupby will return different results - # unless we specify .groupby(..., observed = True). 
- for c in [subject, within]: - if data[c].dtype.name == 'category': - data[c] = data[c].astype(str) - assert not data[within].isnull().any(), "Cannot have missing values in `within`." assert not data[subject].isnull().any(), "Cannot have missing values in `subject`." @@ -532,10 +522,12 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto', data = data_piv.melt(ignore_index=False, value_name=dv).reset_index() # Groupby - grp_with = data.groupby(within)[dv] + # I think that observed=True is actually not needed here since we have already used + # `observed=True` in pivot_table. + grp_with = data.groupby(within, observed=True)[dv] rm = list(data[within].unique()) n_rm = len(rm) - n_obs = int(data.groupby(within)[dv].count().max()) + n_obs = int(grp_with.count().max()) grandmean = data[dv].mean() # Calculate sums of squares @@ -544,7 +536,7 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto', # sstotal = sstime + ss_resall = sstime + (sssubj + sserror) # ss_total = ((data[dv] - grandmean)**2).sum() # We can further divide the residuals into a within and between component: - grp_subj = data.groupby(subject)[dv] + grp_subj = data.groupby(subject, observed=True)[dv] ss_resbetw = n_rm * np.sum((grp_subj.mean() - grandmean)**2) ss_reswith = ss_resall - ss_resbetw @@ -633,13 +625,6 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): # Validate the dataframe _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects='within') - # Convert Categorical columns to string - # This is important otherwise all the groupby will return different results - # unless we specify .groupby(..., observed = True). - for c in [subject, a, b]: - if data[c].dtype.name == 'category': - data[c] = data[c].astype(str) - assert not data[a].isnull().any(), "Cannot have missing values in %s" % a assert not data[b].isnull().any(), "Cannot have missing values in %s" % b assert not data[subject].isnull().any(), "Cannot have missing values in %s" % subject @@ -660,12 +645,14 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): mu = data[dv].mean() # Groupby means - grp_s = data.groupby(subject)[dv].mean() - grp_a = data.groupby([a])[dv].mean() - grp_b = data.groupby([b])[dv].mean() - grp_ab = data.groupby([a, b])[dv].mean() - grp_as = data.groupby([a, subject])[dv].mean() - grp_bs = data.groupby([b, subject])[dv].mean() + # I think that observed=True is actually not needed here since we have already used + # `observed=True` in pivot_table. 
+ grp_s = data.groupby(subject, observed=True)[dv].mean() + grp_a = data.groupby([a], observed=True)[dv].mean() + grp_b = data.groupby([b], observed=True)[dv].mean() + grp_ab = data.groupby([a, b], observed=True)[dv].mean() + grp_as = data.groupby([a, subject], observed=True)[dv].mean() + grp_bs = data.groupby([b, subject], observed=True)[dv].mean() # Sums of squares ss_tot = np.sum((data[dv] - mu)**2) @@ -732,7 +719,6 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): # Epsilon piv_a = data.pivot_table(index=subject, columns=a, values=dv, observed=True) piv_b = data.pivot_table(index=subject, columns=b, values=dv, observed=True) - # piv_ab = data.pivot_table(index=subject, columns=[a, b], values=dv) # Same as data_piv eps_a = epsilon(piv_a, correction='gg') eps_b = epsilon(piv_b, correction='gg') # Note that the GG epsilon of the interaction slightly differs between @@ -931,12 +917,10 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, elif len(between) == 2: # Two factors with balanced design = Pingouin implementation # Two factors with unbalanced design = statsmodels - return anova2(dv=dv, between=between, data=data, ss_type=ss_type, - effsize=effsize) + return anova2(dv=dv, between=between, data=data, ss_type=ss_type, effsize=effsize) else: # 3 or more factors with (un)-balanced design = statsmodels - return anovan(dv=dv, between=between, data=data, ss_type=ss_type, - effsize=effsize) + return anovan(dv=dv, between=between, data=data, ss_type=ss_type, effsize=effsize) # Check data _check_dataframe(dv=dv, between=between, data=data, effects='between') @@ -1035,8 +1019,7 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'): df_resid = data[dv].size - (ng1 * ng2) else: # UNBALANCED DESIGN - return anovan(dv=dv, between=between, data=data, ss_type=ss_type, - effsize=effsize) + return anovan(dv=dv, between=between, data=data, ss_type=ss_type, effsize=effsize) # Mean squares ms_fac1 = ss_fac1 / df_fac1 @@ -1069,8 +1052,7 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'): all_effsize = [np2_fac1, np2_fac2, np2_inter, np.nan] # Create output dataframe - aov = pd.DataFrame({'Source': [fac1, fac2, fac1 + ' * ' + fac2, - 'Residual'], + aov = pd.DataFrame({'Source': [fac1, fac2, fac1 + ' * ' + fac2, 'Residual'], 'SS': [ss_fac1, ss_fac2, ss_inter, ss_resid], 'DF': [df_fac1, df_fac2, df_inter, df_resid], 'MS': [ms_fac1, ms_fac2, ms_inter, ms_resid], @@ -1133,8 +1115,7 @@ def anovan(data=None, dv=None, between=None, ss_type=2, effsize='np2'): aov = aov.iloc[1:, :] aov = aov.reset_index() - aov = aov.rename(columns={'index': 'Source', 'sum_sq': 'SS', - 'df': 'DF', 'PR(>F)': 'p-unc'}) + aov = aov.rename(columns={'index': 'Source', 'sum_sq': 'SS', 'df': 'DF', 'PR(>F)': 'p-unc'}) aov['MS'] = aov['SS'] / aov['DF'] # Effect size @@ -1144,8 +1125,7 @@ def anovan(data=None, dv=None, between=None, ss_type=2, effsize='np2'): all_n2[-1] = np.nan aov['n2'] = all_n2 else: - aov['np2'] = (aov['F'] * aov['DF']) / (aov['F'] * aov['DF'] + - aov.iloc[-1, 2]) + aov['np2'] = (aov['F'] * aov['DF']) / (aov['F'] * aov['DF'] + aov.iloc[-1, 2]) def format_source(x): for fac in between: @@ -1426,13 +1406,6 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, _check_dataframe( dv=dv, within=within, between=between, data=data, subject=subject, effects='interaction') - # Convert Categorical columns to string - # This is important otherwise all the groupby will return different results - # unless we specify 
.groupby(..., observed = True). - for c in [within, between, subject]: - if data[c].dtype.name == 'category': - data[c] = data[c].astype(str) - # Pivot and melt the table. This has several effects: # 1) Force missing values to be explicit (a NaN cell is created) # 2) Automatic collapsing to the mean if multiple within factors are present @@ -1445,7 +1418,7 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, # Check that subject IDs do not overlap between groups: the subject ID # should have a unique range / set of values for each between-subject # group e.g. group1= 1 --> 20 and group2 = 21 --> 40. - if not (data.groupby([subject, within])[between].nunique() == 1).all(): + if not (data.groupby([subject, within], observed=True)[between].nunique() == 1).all(): raise ValueError("Subject IDs cannot overlap between groups: each " "group in `%s` must have a unique set of " "subject IDs, e.g. group1 = [1, 2, 3, ..., 10] " @@ -1461,7 +1434,7 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, ss_betw = aov_betw.at[0, 'SS'] ss_with = aov_with.at[0, 'SS'] # Extract residuals and interactions - grp = data.groupby([between, within])[dv] + grp = data.groupby([between, within], observed=True)[dv] # ssresall = residuals within + residuals between ss_resall = grp.apply(lambda x: (x - x.mean())**2).sum() # Interaction @@ -1470,10 +1443,10 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, ss_resbetw = ss_total - (ss_with + ss_betw + ss_reswith + ss_inter) # DEGREES OF FREEDOM - n_obs = data.groupby(within)[dv].count().max() + n_obs = data.groupby(within, observed=True)[dv].count().max() df_with = aov_with.at[0, 'DF'] df_betw = aov_betw.at[0, 'DF'] - df_resbetw = n_obs - data.groupby(between)[dv].count().count() + df_resbetw = n_obs - data.groupby(between, observed=True)[dv].count().count() df_reswith = df_with * df_resbetw df_inter = aov_with.at[0, 'DF'] * aov_betw.at[0, 'DF'] @@ -1514,22 +1487,21 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, ef_inter = ss_inter / (ss_inter + ss_reswith) # Stats table - aov = pd.concat([aov_betw.drop(1), aov_with.drop(1)], sort=False, ignore_index=True) + aov = pd.concat([aov_betw.drop(1), aov_with.drop(1)], axis=0, sort=False, ignore_index=True) # Update values aov.rename(columns={'DF': 'DF1'}, inplace=True) aov.at[0, 'F'], aov.at[1, 'F'] = f_betw, f_with aov.at[0, 'p-unc'], aov.at[1, 'p-unc'] = p_betw, p_with aov.at[0, effsize], aov.at[1, effsize] = ef_betw, ef_with - aov = aov.append({ + aov_inter = pd.DataFrame({ 'Source': 'Interaction', 'SS': ss_inter, 'DF1': df_inter, 'MS': ms_inter, 'F': f_inter, - 'p-unc': p_inter, effsize: ef_inter}, ignore_index=True) - + 'p-unc': p_inter, effsize: ef_inter}, index=[2]) + aov = pd.concat([aov, aov_inter], axis=0, sort=False, ignore_index=True) aov['DF2'] = [df_resbetw, df_reswith, df_reswith] aov['eps'] = [np.nan, aov_with.at[0, 'eps'], np.nan] col_order = [ 'Source', 'SS', 'DF1', 'DF2', 'MS', 'F', 'p-unc', 'p-GG-corr', effsize, 'eps', 'sphericity', 'W-spher', 'p-spher'] - aov = aov.reindex(columns=col_order) aov.dropna(how='all', axis=1, inplace=True) return _postprocess_dataframe(aov) diff --git a/pingouin/regression.py b/pingouin/regression.py index 437bf1ab..5276fd60 100644 --- a/pingouin/regression.py +++ b/pingouin/regression.py @@ -460,8 +460,8 @@ def linear_regression(X, y, add_intercept=True, weights=None, coef_only=False, # Relative importance if relimp: - data = pd.concat([pd.DataFrame(y, 
columns=['y']), - pd.DataFrame(X, columns=names)], sort=False, axis=1) + data = pd.concat( + [pd.DataFrame(y, columns=['y']), pd.DataFrame(X, columns=names)], sort=False, axis=1) if 'Intercept' in names: # Intercept is the first column reli = _relimp(data.drop(columns=['Intercept']).cov()) @@ -1254,7 +1254,7 @@ def mediation_analysis(data=None, x=None, m=None, y=None, covar=None, indirect['names'] = 'Indirect' else: indirect['names'] = indirect['names'].apply(lambda x: 'Indirect %s' % x) - stats = stats.append(indirect, ignore_index=True) + stats = pd.concat([stats, indirect], axis=0, ignore_index=True, sort=False) stats = stats.rename(columns={'names': 'path'}) # Restore options diff --git a/pingouin/tests/test_correlation.py b/pingouin/tests/test_correlation.py index a5302630..ee1c0bb8 100644 --- a/pingouin/tests/test_correlation.py +++ b/pingouin/tests/test_correlation.py @@ -112,9 +112,10 @@ def test_corr(self): stats = corr(df['Neuroticism'], df['Extraversion']) assert np.isclose(1 / float(stats['BF10'].to_numpy()), 1.478e-13) # Perfect correlation, CI and power should be 1, BF should be Inf + # https://github.com/raphaelvallat/pingouin/issues/195 stats = corr(x, x) - assert stats.at['pearson', 'r'] == 1 - assert stats.at['pearson', 'power'] == 1 + assert np.isclose(stats.at['pearson', 'r'], 1) + assert np.isclose(stats.at['pearson', 'power'], 1) # When one column is a constant, the correlation is not defined # and Pingouin return a DataFrame full of NaN, except for ``n`` x, y = [1, 1, 1], [1, 2, 3] diff --git a/pingouin/tests/test_distribution.py b/pingouin/tests/test_distribution.py index 883a57ad..9de5a678 100644 --- a/pingouin/tests/test_distribution.py +++ b/pingouin/tests/test_distribution.py @@ -62,20 +62,22 @@ def test_normality(self): normality(x, alpha=.05) normality(x.tolist(), method='normaltest', alpha=.05) # Pandas DataFrame - df_nan_piv = df_nan.pivot(index='Subject', columns='Time', - values='Scores') + df_nan_piv = df_nan.pivot(index='Subject', columns='Time', values='Scores') normality(df_nan_piv) # Wide-format dataframe normality(df_nan_piv['August']) # pandas Series # The line below is disabled because test fails on python 3.5 # assert stats_piv.equals(normality(df_nan, group='Time', dv='Scores')) normality(df_nan, group='Group', dv='Scores', method='normaltest') + normality(df_nan, group='Group', dv='Scores', method='jarque_bera') def test_homoscedasticity(self): """Test function test_homoscedasticity.""" hl = homoscedasticity(data=[x, y], alpha=.05) homoscedasticity(data=[x, y], method='bartlett', alpha=.05) hd = homoscedasticity(data={'x': x, 'y': y}, alpha=.05) + hd2 = homoscedasticity(data={'x': x, 'y': y}, alpha=.05, center="mean") assert hl.equals(hd) + assert not hd.equals(hd2) # Wide-format DataFrame homoscedasticity(df_pivot) # Long-format diff --git a/pingouin/tests/test_pairwise.py b/pingouin/tests/test_pairwise.py index 5972f095..f2c9fb9b 100644 --- a/pingouin/tests/test_pairwise.py +++ b/pingouin/tests/test_pairwise.py @@ -120,7 +120,7 @@ def test_pairwise_ttests(self): pt_med = pairwise_ttests(dv='Scores', within='Time', subject='Subject', padjust='holm', data=df[df['Group'] == 'Meditation']) - pt_merged = pt_con.append(pt_med) + pt_merged = pd.concat([pt_con, pt_med], ignore_index=True, sort=False) # T, dof and p-values should be equal assert np.array_equal(pt_merged['T'], pt['T'].iloc[4:]) assert np.array_equal(pt_merged['dof'], pt['dof'].iloc[4:]) diff --git a/pingouin/tests/test_parametric.py b/pingouin/tests/test_parametric.py index 
fcecbca7..ee33f155 100644 --- a/pingouin/tests/test_parametric.py +++ b/pingouin/tests/test_parametric.py @@ -18,8 +18,8 @@ df_cat[['Time', 'Group', 'Subject']] = df_cat[[ 'Time', 'Group', 'Subject']].astype('category') # Let's complicate things even more and add "ghost" Categories -df_cat['Time'].cat.add_categories('Casper', inplace=True) -df_cat['Group'].cat.add_categories('The Friendly Ghost', inplace=True) +df_cat['Time'] = df_cat['Time'].cat.add_categories('Casper') +df_cat['Group'] = df_cat['Group'].cat.add_categories('The Friendly Ghost') # Create random normal variables np.random.seed(1234) @@ -197,9 +197,8 @@ def test_anova(self): # Unbalanced and with missing values AND between as a categorical df_paincat = df_pain.copy() df_paincat['Hair color'] = df_paincat['Hair color'].astype('category') - df_paincat['Hair color'].cat.add_categories('Bald', inplace=True) - aov = df_paincat.anova(dv='Pain threshold', - between='Hair color').round(3) + df_paincat['Hair color'] = df_paincat['Hair color'].cat.add_categories('Bald') + aov = df_paincat.anova(dv='Pain threshold', between='Hair color').round(3) assert aov.at[0, 'ddof1'] == 3 assert aov.at[0, 'ddof2'] == 13 assert aov.at[0, 'F'] == 4.359 @@ -381,7 +380,7 @@ def test_rm_anova2(self): data_cat = data.copy() data_cat[['Subject', 'Time', 'Metric']] = \ data_cat[['Subject', 'Time', 'Metric']].astype('category') - data_cat['Time'].cat.add_categories('Casper', inplace=True) + data_cat['Time'] = data_cat['Time'].cat.add_categories('Casper') aov = rm_anova(data=data_cat, subject='Subject', within=['Time', 'Metric'], dv='Performance').round(3) array_equal(aov.loc[:, 'MS'], [828.817, 682.617, 112.217]) diff --git a/pingouin/tests/test_utils.py b/pingouin/tests/test_utils.py index bbf2e4a0..0137ffac 100644 --- a/pingouin/tests/test_utils.py +++ b/pingouin/tests/test_utils.py @@ -137,6 +137,12 @@ def test_remove_na(self): assert np.allclose(y_nan, [[6.], [3.], [2.]]) # When y is None remove_na(x, None, paired=False) + # When x or y is an empty list + # See https://github.com/raphaelvallat/pingouin/issues/222 + with pytest.raises(AssertionError): + remove_na(x=[], y=0) + with pytest.raises(AssertionError): + remove_na(x, y=[]) def test_check_eftype(self): """Test function _check_eftype.""" diff --git a/pingouin/utils.py b/pingouin/utils.py index b46e11dc..2563ecb4 100644 --- a/pingouin/utils.py +++ b/pingouin/utils.py @@ -225,6 +225,7 @@ def remove_na(x, y=None, paired=False, axis='rows'): return _remove_na_single(x, axis=axis), y else: # y is list, np.array, pd.Series y = np.asarray(y) + assert y.size != 0, "y cannot be an empty list or array." 
# Make sure that we just pass-through if y have only 1 element if y.size == 1: return _remove_na_single(x, axis=axis), y diff --git a/requirements.txt b/requirements.txt index 51c57f3d..5a32a1a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ numpy>=1.19 scipy>=1.7 pandas>=1.0 matplotlib>=3.0.2 -seaborn>=0.9.0 -statsmodels>=0.12.0 +seaborn>=0.11 +statsmodels>=0.13 scikit-learn pandas_flavor>=0.2.0 outdated diff --git a/setup.py b/setup.py index 41b0fc27..273c49a2 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def read(fname): MAINTAINER_EMAIL = 'raphaelvallat9@gmail.com' URL = 'https://pingouin-stats.org/index.html' DOWNLOAD_URL = 'https://github.com/raphaelvallat/pingouin/' -VERSION = '0.5.0' +VERSION = '0.5.1' LICENSE = 'GPL-3.0' PACKAGE_DATA = {'pingouin.data.icons': ['*.svg']} @@ -24,8 +24,8 @@ def read(fname): 'scipy>=1.7', 'pandas>=1.0', 'matplotlib>=3.0.2', - 'seaborn>=0.9.0', - 'statsmodels>=0.12.0', + 'seaborn>=0.11', + 'statsmodels>=0.13', 'scikit-learn', 'pandas_flavor>=0.2.0', 'outdated',
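A minimal usage sketch of the user-facing changes bundled in this release (Jarque-Bera option in ``pg.normality``, keyword pass-through in ``pg.homoscedasticity``, preserved categorical dtypes in ``rm_anova``/``mixed_anova``, and the explicit error on an empty ``y``). The random data, seed, and column selections below are illustrative assumptions; only the function calls themselves come from the patch above::

    import numpy as np
    import pingouin as pg

    rng = np.random.default_rng(42)
    x = rng.normal(loc=0, scale=1.0, size=100)
    y = rng.normal(loc=0, scale=1.5, size=100)

    # New in this release: Jarque-Bera normality test (issue 216).
    print(pg.normality(x, method="jarque_bera"))

    # New in this release: extra keyword arguments are forwarded to
    # scipy.stats.levene (issue 218), e.g. centering on the mean instead
    # of the median.
    print(pg.homoscedasticity([x, y], method="levene", center="mean"))

    # Bugfix (issue 224): rm_anova / mixed_anova no longer convert the
    # categorical columns of the input dataframe in-place.
    df = pg.read_dataset("mixed_anova.csv")
    df[["Time", "Group", "Subject"]] = df[["Time", "Group", "Subject"]].astype("category")
    pg.mixed_anova(data=df, dv="Scores", within="Time", subject="Subject", between="Group")
    assert df["Time"].dtype.name == "category"  # dtypes are preserved

    # Bugfix (issue 222): an empty y now raises an explicit error.
    try:
        pg.ttest(x, [])
    except AssertionError as err:
        print(err)  # "y cannot be an empty list or array."

Note also that, with pandas 1.4, the display options used in the docstrings are spelled ``display.expand_frame_repr`` and ``display.max_columns``, as updated in the diff above.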