Merge pull request #188 from phobson/add-group-comparisons
Add group comparisons
phobson authored Mar 26, 2024
2 parents a71f374 + c5c793f commit 18d727e
Showing 11 changed files with 747 additions and 134 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-runtests-basic.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v2
209 changes: 164 additions & 45 deletions wqio/datacollections.py
@@ -20,10 +20,11 @@
_Stat = namedtuple("_stat", ["stat", "pvalue"])


def _dist_compare(x, y, stat_comp_func):
def _dist_compare(x, y, stat_comp_func, **test_opts):
if (len(x) == len(y)) and numpy.equal(x, y).all():
return _Stat(numpy.nan, numpy.nan)
return stat_comp_func(x, y, alternative="two-sided")

return stat_comp_func(x, y, **test_opts)


class DataCollection:
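
For context, a minimal, self-contained sketch (with made-up sample values) of how the reworked `_dist_compare` helper now forwards arbitrary test options to the underlying SciPy function instead of hard-coding `alternative="two-sided"`:

from collections import namedtuple
from functools import partial

import numpy
from scipy import stats

_Stat = namedtuple("_stat", ["stat", "pvalue"])


def _dist_compare(x, y, stat_comp_func, **test_opts):
    # identical samples short-circuit to NaN rather than calling the test
    if (len(x) == len(y)) and numpy.equal(x, y).all():
        return _Stat(numpy.nan, numpy.nan)
    return stat_comp_func(x, y, **test_opts)


# hypothetical samples purely for illustration
x = [1.2, 3.4, 2.2, 5.0]
y = [0.8, 2.9, 1.7, 4.1]

# any keyword accepted by the SciPy test can now be passed through
compare = partial(_dist_compare, stat_comp_func=stats.mannwhitneyu)
print(compare(x, y, alternative="greater"))
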
@@ -130,6 +131,8 @@ def __init__(
**{self.cencol: dataframe[self.qualcol].isin(self.ndval)}
).reset_index()

self.pbarfxn = tqdm if (self.showpbar and tqdm) else utils.misc.no_op

@cache_readonly
def tidy(self):
if self.useros:
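
As an aside, a short sketch of the progress-bar fallback pattern introduced here; the `show_progress` flag and the loop below are hypothetical stand-ins for `self.showpbar` and the per-group work, and `no_op` plays the role of `utils.misc.no_op`:

try:
    from tqdm import tqdm
except ImportError:  # tqdm is an optional dependency
    tqdm = None


def no_op(iterable, *args, **kwargs):
    # stand-in for tqdm that simply hands back the iterable unchanged
    return iterable


show_progress = True  # hypothetical flag mirroring self.showpbar
pbarfxn = tqdm if (show_progress and tqdm) else no_op

for item in pbarfxn(range(1000)):
    pass  # long-running per-group work would go here
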
@@ -323,7 +326,7 @@ def mean(self):

@cache_readonly
def std_dev(self):
return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False)
return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False, ddof=1)

def percentile(self, percentile):
"""Return the percentiles (0 - 100) for the data."""
@@ -342,7 +345,7 @@ def logmean(self):
@cache_readonly
def logstd_dev(self):
return self.generic_stat(
lambda x, axis=0: numpy.std(numpy.log(x), axis=axis),
lambda x, axis=0: numpy.std(numpy.log(x), axis=axis, ddof=1),
use_bootstrap=False,
statname="Log-std. dev.",
)
@@ -359,65 +362,93 @@ def geostd_dev(self):
geostd.columns.names = ["station", "Geo-std. dev."]
return geostd

@cache_readonly
def shapiro(self):
def shapiro(self, **opts):
"""
Run the Shapiro-Wilk test for normality on the datasets.
Requires at least 3 observations in each dataset.
See `scipy.stats.shapiro` for info on kwargs you can pass.
"""
return self.generic_stat(
stats.shapiro,
use_bootstrap=False,
has_pvalue=True,
statname="shapiro",
filterfxn=lambda x: x.shape[0] > 3,
**opts,
)

@cache_readonly
def shapiro_log(self):
def shapiro_log(self, **opts):
"""
Run the Shapiro-Wilk test for normality on log-transformed datasets.
Requires at least 3 observations in each dataset.
See `scipy.stats.shapiro` for info on kwargs you can pass.
"""
return self.generic_stat(
lambda x: stats.shapiro(numpy.log(x)),
use_bootstrap=False,
has_pvalue=True,
filterfxn=lambda x: x.shape[0] > 3,
statname="log-shapiro",
**opts,
)

@cache_readonly
def lilliefors(self):
def lilliefors(self, **opts):
"""
Run the Lilliefors test for normality on the datasets.
Requires at least 3 observations in each dataset.
See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
"""
return self.generic_stat(
sm.stats.lilliefors,
use_bootstrap=False,
has_pvalue=True,
statname="lilliefors",
**opts,
)

@cache_readonly
def lilliefors_log(self):
def lilliefors_log(self, **opts):
"""
Run the Lilliefors test for normality on the log-transformed datasets.
Requires at least 3 observations in each dataset.
See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
"""
return self.generic_stat(
lambda x: sm.stats.lilliefors(numpy.log(x)),
use_bootstrap=False,
has_pvalue=True,
statname="log-lilliefors",
**opts,
)

@cache_readonly
def anderson_darling(self):
def anderson_darling(self, **opts):
raise NotImplementedError
return self.generic_stat(
utils.anderson_darling,
use_bootstrap=False,
has_pvalue=True,
statname="anderson-darling",
**opts,
)

@cache_readonly
def anderson_darling_log(self):
def anderson_darling_log(self, **opts):
raise NotImplementedError
return self.generic_stat(
lambda x: utils.anderson_darling(numpy.log(x)),
use_bootstrap=False,
has_pvalue=True,
statname="log-anderson-darling",
**opts,
)

def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
def comparison_stat_twoway(self, statfxn, statname=None, paired=False, **statopts):
"""Generic function to apply comparative hypothesis tests to
the groups of the ``DataCollection``.
@@ -430,7 +461,7 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
statname : string, optional
Name of the statistic. Included as a column name in the
final dataframe.
apired : bool, optional
paired : bool, optional
Set to ``True`` if ``statfxn`` requires paired data.
**statopts : optional kwargs
Additional keyword arguments that will be passed to
@@ -455,9 +486,9 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
>>> dc = DataCollection(df, rescol='res', qualcol='qual',
... stationcol='loc', paramcol='param',
... ndval='<')
>>> mwht = dc.comparison_stat(stats.mannwhitneyu,
... statname='mann_whitney',
... alternative='two-sided')
>>> mwht = dc.comparison_stat_twoway(stats.mannwhitneyu,
... statname='mann_whitney',
... alternative='two-sided')
"""

@@ -475,46 +506,134 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
index_cols = meta_columns + station_columns

results = generator(
data, meta_columns, self.stationcol, rescol, statfxn, statname=statname, **statopts
data,
meta_columns,
self.stationcol,
rescol,
statfxn,
statname=statname,
pbarfxn=self.pbarfxn,
**statopts,
)
return pandas.DataFrame.from_records(results).set_index(index_cols)

@cache_readonly
def mann_whitney(self):
return self.comparison_stat(
partial(_dist_compare, stat_comp_func=stats.mannwhitneyu),
def comparison_stat_allway(self, statfxn, statname, control=None, **statopts):
results = utils.numutils._group_comp_stat_generator(
self.tidy,
self.groupcols_comparison,
self.stationcol,
self.rescol,
statfxn,
statname=statname,
control=control,
pbarfxn=self.pbarfxn,
**statopts,
)
return pandas.DataFrame.from_records(results).set_index(self.groupcols_comparison)

def mann_whitney(self, **opts):
"""
Run the Mann-Whitney U test across datasets.
See `scipy.stats.mannwhitneyu` for available options.
"""
return self.comparison_stat_twoway(
partial(_dist_compare, stat_comp_func=stats.mannwhitneyu, **opts),
statname="mann_whitney",
)

@cache_readonly
def ranksums(self):
return self.comparison_stat(stats.ranksums, statname="rank_sums")
def ranksums(self, **opts):
"""
Run the unpaired Wilcoxon rank-sum test across datasets.
@cache_readonly
def t_test(self):
return self.comparison_stat(stats.ttest_ind, statname="t_test", equal_var=False)
See `scipy.stats.ranksums` for available options.
"""
return self.comparison_stat_twoway(stats.ranksums, statname="rank_sums", **opts)

@cache_readonly
def levene(self):
return self.comparison_stat(stats.levene, statname="levene", center="median")
def t_test(self, **opts):
"""
Run the t-test for two independent samples of scores.
@cache_readonly
def wilcoxon(self):
return self.comparison_stat(
See `scipy.stats.ttest_ind` for available options.
"""
return self.comparison_stat_twoway(stats.ttest_ind, statname="t_test", **opts)

def levene(self, **opts):
"""
Run the Levene test for equal variances.
See `scipy.stats.levene` for available options.
"""
return self.comparison_stat_twoway(stats.levene, statname="levene", **opts)

def wilcoxon(self, **opts):
"""
Run the Wilcoxon signed-rank test across paired datasets.
See `scipy.stats.wilcoxon` for available options.
"""
return self.comparison_stat_twoway(
partial(_dist_compare, stat_comp_func=stats.wilcoxon),
statname="wilcoxon",
paired=True,
**opts,
)

@cache_readonly
def kendall(self):
return self.comparison_stat(stats.kendalltau, statname="kendalltau", paired=True)
def kendall(self, **opts):
"""
Run the Kendall tau correlation test across paired datasets.
@cache_readonly
def spearman(self):
return self.comparison_stat(stats.spearmanr, statname="spearmanrho", paired=True)
See `scipy.stats.kendalltau` for available options.
"""
return self.comparison_stat_twoway(
stats.kendalltau, statname="kendalltau", paired=True, **opts
)

def spearman(self, **opts):
"""
Run the Spearman rank correlation (rho) test across paired datasets.
See `scipy.stats.spearmanr` for available options.
"""
return self.comparison_stat_twoway(
stats.spearmanr, statname="spearmanrho", paired=True, **opts
)

def kruskal_wallis(self, **opts):
"""
Run the Kruskal-Wallis H-test across all datasets.
See `scipy.stats.kruskal` for available options.
"""
return self.comparison_stat_allway(stats.kruskal, statname="K-W H", control=None, **opts)

def f_test(self, **opts):
"""
Run the one-way ANOVA across all datasets.
See `scipy.stats.f_oneway` for available options.
"""
return self.comparison_stat_allway(stats.f_oneway, statname="f-test", control=None, **opts)

def tukey_hsd(self) -> tuple[pandas.DataFrame, pandas.DataFrame]:
"""
Run the Tukey Honestly Significant Difference (HSD) test across stations and other groups
for each parameter.
"""
hsd = utils.tukey_hsd(
self.tidy, self.rescol, self.stationcol, self.paramcol, *self.othergroups
)
scores = utils.process_tukey_hsd_scores(hsd, self.stationcol, self.paramcol)
return hsd, scores

def dunn(self):
"""
Run Dunn's test across the different stations for each pollutant.
"""
return self.tidy.groupby(by=[self.paramcol]).apply(
lambda g: utils.dunn_test(g, self.rescol, self.stationcol, *self.othergroups).scores
)

@cache_readonly
def theilslopes(self, logs=False):
raise NotImplementedError

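To summarize the reworked API, here is a hypothetical usage sketch. The column names, sample data, and keyword choices are made up for illustration; the `DataCollection` constructor call mirrors the docstring example in this diff, and every method shown is added or renamed by this PR (`comparison_stat_twoway` and `comparison_stat_allway` back the test methods).

import numpy
import pandas
from wqio import DataCollection

# hypothetical paired-station dataset
df = pandas.DataFrame({
    "loc": ["Inflow", "Outflow"] * 20,
    "param": ["Copper"] * 40,
    "res": numpy.random.lognormal(mean=1.0, sigma=0.5, size=40),
    "qual": [""] * 40,  # no non-detects in this toy example
})

dc = DataCollection(df, rescol="res", qualcol="qual",
                    stationcol="loc", paramcol="param", ndval="<")

# two-way (station vs. station) comparisons; kwargs are forwarded to the SciPy tests
mw = dc.mann_whitney(alternative="two-sided")
tt = dc.t_test(equal_var=False)

# all-way comparisons across every station at once
kw = dc.kruskal_wallis()
anova = dc.f_test()

# post-hoc multiple comparisons
hsd_results, hsd_scores = dc.tukey_hsd()
dunn_scores = dc.dunn()

Per the code above, each method returns a dataframe of statistics and p-values indexed by the grouping columns (parameter plus station pair for the two-way tests, the comparison group columns for the all-way tests).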
14 changes: 6 additions & 8 deletions wqio/features.py
@@ -1599,20 +1599,18 @@ def scatterplot(
raise ValueError(f"`eqn_pos` must be one of {list(positions.keys())}")
# annotate axes with stats

slope = utils.sig_figs(modelres.params[1], n=3)
icept = utils.sig_figs(modelres.params[0], n=3)
ax.annotate(
r"$\log(y) = {} \, \log(x) + {}$".format(
utils.sig_figs(modelres.params[1], n=3),
utils.sig_figs(modelres.params[0], n=3),
),
rf"$\log(y) = {slope} \, \log(x) + {icept}$",
(txt_x, txt_y),
xycoords="axes fraction",
)

slope_pval = utils.process_p_vals(modelres.pvalues[1])
icept_pval = utils.process_p_vals(modelres.pvalues[0])
ax.annotate(
"Slope p-value: {}\nIntercept p-value: {}".format(
utils.process_p_vals(modelres.pvalues[1]),
utils.process_p_vals(modelres.pvalues[0]),
),
f"Slope p-value: {slope_pval}\nIntercept p-value: {icept_pval}",
(txt_x, txt_y - vert_offset),
xycoords="axes fraction",
)
5 changes: 2 additions & 3 deletions wqio/tests/__init__.py
@@ -1,6 +1,5 @@
import warnings

from pkg_resources import resource_filename
from importlib import resources

from wqio.tests.helpers import requires

@@ -12,7 +11,7 @@

@requires(pytest, "pytest")
def test(*args):
options = [resource_filename("wqio", "")]
options = [str(resources.files("wqio"))]
options.extend(list(args))
return pytest.main(options)

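For reference, a minimal sketch of the `pkg_resources` to `importlib.resources` migration shown above; `importlib.resources.files` is part of the standard library on the Python versions in this PR's runtime matrix, and `str()` of the result gives a filesystem path for a regular install.

from importlib import resources

# replacement for the deprecated pkg_resources.resource_filename("wqio", "")
package_root = str(resources.files("wqio"))
print(package_root)  # filesystem path to the installed wqio package
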
Binary file added wqio/tests/_data/sed.pkl
Binary file added wqio/tests/_data/wq.pkl
