ENH Generic design support using formulaic (#328)

* feat: set design matrices using formulaic (WIP) * feat: set design matrices using formulaic (WIP) * feat: check that there are no NaNs in the design matrix * feat: implement support for pairwise categorical contrasts * feat: allow directly inputing a contrast vector * feat: allow setting design factors as in versions < 0.5.* for backward compatibiliy, but throw deprecation warning * feat: improve summary message when a contrast vector was directly provided * docs: update docstrings to reflect deprecation of design_factors * chore: deprecate ref_level * chore: deprecate continuous_factors * chore: remove build_design_matrix and remove_underscores utils, as they are no longer used * refactor: update lfc_shrink to reflect new design format * tests: update edge case tests * tests: update main tests * docs: remove build_design_matrix from docs * docs: add missing docstrings in new DeseqStats methods * feat: add formulaic utils copied from pertpy * build!: discontinue python 3.9 * docs: update minimal example * docs: update data loading example * docs: update step by step example * fix: typo * feat: implement a `to_picklable_anndata()` method to allow saving a dds without its unpicklable attributes * docs: add API documentation for to_picklable_anndata * docs: fix rst formatting * refactor: throw a ValueError instead of an AssertionError when numerical constrasts have incorrect shapes * test: ignore anndata ImplicitModificationWarnings in test suite * test: add _formulaic test suite from pertpy * refactor: move .cond() to DeseqDataSet so that it can be used at DeseqStats initialization * docs: add more details on providing a design matrix directly * tests: add a few tests when a design matrix is directly provided * refactor: throw ValueError instead of AssertionError in case of design with invalid type * docs: improve lfc_shrink example * docs: document the new materializer attributes
owkin · Nov 18, 2024 · ed62d06 · ed62d06
1 parent c5f2335
commit ed62d06
Show file tree

Hide file tree

Showing 15 changed files with 1,174 additions and 700 deletions.
diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml
@@ -17,8 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - os: ubuntu-latest
-            python: "3.9"
           - os: ubuntu-latest
             python: "3.10"
           - os: ubuntu-latest

diff --git a/docs/source/api/docstrings/pydeseq2.dds.DeseqDataSet.rst b/docs/source/api/docstrings/pydeseq2.dds.DeseqDataSet.rst
@@ -10,6 +10,7 @@
     .. autosummary::
 
         ~DeseqDataSet.calculate_cooks
+        ~DeseqDataSet.cond
         ~DeseqDataSet.deseq2
         ~DeseqDataSet.fit_LFC
         ~DeseqDataSet.fit_MAP_dispersions
@@ -19,5 +20,6 @@
         ~DeseqDataSet.fit_size_factors
         ~DeseqDataSet.plot_dispersions
         ~DeseqDataSet.refit
+        ~DeseqDataSet.to_picklable_anndata
         ~DeseqDataSet.vst
 
diff --git a/docs/source/api/docstrings/pydeseq2.utils.rst b/docs/source/api/docstrings/pydeseq2.utils.rst
@@ -13,7 +13,6 @@
 
    .. autosummary::
 
-      build_design_matrix
       dispersion_trend
       dnb_nll
       fit_alpha_mle

diff --git a/examples/plot_minimal_pydeseq2_pipeline.py b/examples/plot_minimal_pydeseq2_pipeline.py
@@ -135,7 +135,7 @@
 dds = DeseqDataSet(
     counts=counts_df,
     metadata=metadata,
-    design_factors="condition",
+    design="~condition",
     refit_cooks=True,
     inference=inference,
     # n_cpus=8, # n_cpus can be specified here or in the inference object
@@ -146,13 +146,13 @@
 # arguments: a ``counts`` and a ``metadata`` dataframe, like the ones we've loaded in the
 # first part of this tutorial.
 #
-# Next, we should specify the ``design_factor``, i.e. the column of the ``metadata``
-# dataframe that will be used to compare samples. This can be a single string as above,
-# or a list of strings, as in the
-# :ref:`section on multifactor analysis<multifactor_ref>`.
+# Next, we should specify a ``design``, i.e. a Wilkinson formula that describes the
+# design, or directly a design matrix. Here we provide a formula, which is a string
+# that `formulaic <https://github.com/matthewwardrop/formulaic>`_ should be able to
+# parse.
 #
 # .. note::
-#   The ``"condition"`` argument passed to ``design_factors`` corresponds to a column
+#   The ``"condition"`` factor in ``design`` corresponds to a column
 #   from the ``metadata`` dataframe we loaded earlier.
 #   You might need to change it according to your own dataset.
 #
@@ -217,11 +217,15 @@
 #
 # Now that dispersions and LFCs were fitted, we may proceed with statistical tests to
 # compute p-values and adjusted p-values for differential expresion. This is the role of
-# the :class:`DeseqStats` class. It has a unique mandatory argument, ``dds``, which
-# should be a *fitted* :class:`DeseqDataSet <pydeseq2.dds.DeseqDataSet>`
-# object.
+# the :class:`DeseqStats` class. It has two mandatory arguments:
+#
+# - ``dds``, which should be a *fitted* :class:`DeseqDataSet <pydeseq2.dds.DeseqDataSet>`
+#   object,
+# - ``contrast``, which is a list of three strings of the form
+#   ``["variable", "tested_level", "control_level"]``, or directly a contrast vector.
+#
 
-stat_res = DeseqStats(dds, inference=inference)
+ds = DeseqStats(dds, contrast=["condition", "B", "A"], inference=inference)
 
 # %%
 # It also has a set of optional keyword arguments (see the :doc:`API documentation
@@ -248,14 +252,14 @@
 # :meth:`summary() <DeseqStats.summary>` method, which runs the whole statistical
 # analysis, cooks filtering and multiple testing adjustement included.
 
-stat_res.summary()
+ds.summary()
 
 if SAVE:
-    with open(os.path.join(OUTPUT_PATH, "stat_results.pkl"), "wb") as f:
-        pkl.dump(stat_res, f)
+    with open(os.path.join(OUTPUT_PATH, "ds.pkl"), "wb") as f:
+        pkl.dump(ds, f)
 
 # %%
-# The results are then stored in the ``results_df`` attribute (``stat_res.results_df``).
+# The results are then stored in the ``results_df`` attribute (``ds.results_df``).
 
 # %%
 # Optional: threshold-based tests
@@ -267,22 +271,24 @@
 # null hypothesis. It can take one of the values
 # ``["greaterAbs", "lessAbs", "greater", "less"]``.
 
-stat_res.summary(lfc_null=0.1, alt_hypothesis="greaterAbs")
-stat_res.plot_MA(s=20)
+ds.summary(lfc_null=0.1, alt_hypothesis="greaterAbs")
+ds.plot_MA(s=20)
 
 # %%
 # LFC shrinkage
 # """""""""""""
 #
 # For visualization or post-processing purposes, it might be suitable to perform
 # LFC shrinkage. This is implemented by the :meth:`lfc_shrink() <DeseqStats.lfc_shrink>`
-# method.
+# method, which takes as argument the name of the coefficient to shrink (i.e., the name
+# of one of the columns of the design matrix ``dds.obsm["design_matrix"]``).
+# For instance, to shrink the LFCs of the ``condition B`` samples, we would run:
 
-stat_res.lfc_shrink(coeff="condition_B_vs_A")
+ds.lfc_shrink(coeff="condition[T.B]")
 
 if SAVE:
-    with open(os.path.join(OUTPUT_PATH, "shrunk_stat_results.pkl"), "wb") as f:
-        pkl.dump(stat_res, f)
+    with open(os.path.join(OUTPUT_PATH, "shrunk_results.pkl"), "wb") as f:
+        pkl.dump(ds, f)
 
 # %%
 # .. note::
@@ -291,7 +297,7 @@
 #   This can be checked using the ``shrunk_LFCs`` flag.
 #
 
-print(stat_res.shrunk_LFCs)  # Will be True only if lfc_shrink() was run.
+print(ds.shrunk_LFCs)  # Will be True only if lfc_shrink() was run.
 
 # %%
 # .. _multifactor_ref:
@@ -314,23 +320,18 @@
 # ^^^^^^^^^^^^^^^^^^^^^
 #
 # To perform multifactor analysis with PyDESeq2, we start by inializing a
-# :class:`DeseqDataSet` as previously, but we provide the list of variables we would like
-# to use in the ``design_factors`` argument.
+# :class:`DeseqDataSet` as previously, but we provide several variables we would like
+# to use in the ``design`` argument.
 #
 
 dds = DeseqDataSet(
     counts=counts_df,
     metadata=metadata,
-    design_factors=["group", "condition"],
+    design="~group + condition",
     refit_cooks=True,
     inference=inference,
 )
 # %%
-# .. note::
-#   By default, the last variable in the list (here, ``"condition"``) will be the one for
-#   which LFCs and p-values will be displayed, but this may be changed later on when
-#   performing the statistical analysis.
-#
 # As for the single-factor analysis, we fit dispersions and LFCs using the
 # :meth:`deseq2() <DeseqDataSet.deseq2>` method.
 
@@ -349,30 +350,22 @@
 # ^^^^^^^^^^^^^^^^^^^^
 #
 # P-values are computed as earlier from a :class:`DeseqStats` object with the
-# :meth:`summary() <DeseqStats.summary>` method, with a new important argument:
-# the ``contrast``.
+# :meth:`summary() <DeseqStats.summary>` method. The ``contrast`` argument will allow us
+# to determines which variable we want to obtain LFCs and pvalues for.
 # It is a list of three strings of the form
-# ``["variable", "tested level", "reference level"]`` which determines which
-# variable we want to compute LFCs and pvalues for.
+# ``["variable", "tested level", "reference level"]``
 # As an example, to compare the condition B to the condition A, we set
 # ``contrast=["condition", "B", "A"]``.
 #
 
-stat_res_B_vs_A = DeseqStats(dds, contrast=["condition", "B", "A"], inference=inference)
+ds_B_vs_A = DeseqStats(dds, contrast=["condition", "B", "A"], inference=inference)
 
 # %%
-# .. note::
-#   If left blank, the variable of interest will be the last one provided in
-#   the ``design_factors`` attribute of the corresponding
-#   :class:`DeseqDataSet <pydeseq2.dds.DeseqDataSet>` object,
-#   and the reference level will be picked alphabetically.
-#   In any case, *both variables are still used*. This is due to the fact that ``dds``
-#   was fit with both as design factors.
 #
 # Let us fit p-values:
 #
 
-stat_res_B_vs_A.summary()
+ds_B_vs_A.summary()
 
 
 # %%
@@ -385,8 +378,8 @@
 # :class:`DeseqDataSet <pydeseq2.dds.DeseqDataSet>`
 # with ``contrast=["group", "Y", "X"]``, and run the analysis again.
 
-stat_res_Y_vs_X = DeseqStats(dds, contrast=["group", "Y", "X"], inference=inference)
-stat_res_Y_vs_X.summary()
+ds_Y_vs_X = DeseqStats(dds, contrast=["group", "Y", "X"], inference=inference)
+ds_Y_vs_X.summary()
 
 # %%
 # LFC shrinkage (multifactor)
@@ -396,4 +389,4 @@
 # only shrink the LFCs of a :class:`DeseqStats` object based on its
 # ``contrast`` argument.
 
-stat_res_B_vs_A.lfc_shrink(coeff="condition_B_vs_A")
+ds_B_vs_A.lfc_shrink(coeff="condition[T.B]")
diff --git a/examples/plot_pandas_io_example.py b/examples/plot_pandas_io_example.py
@@ -3,7 +3,7 @@
 ======================================================
 
 In this example, we show how load data in order to perform a DEA analysis with PyDESeq2,
-and how to solve its results, using `pandas <https://pandas.pydata.org/>`_ and
+and how to save its results, using `pandas <https://pandas.pydata.org/>`_ and
 `pickle <https://docs.python.org/3/library/pickle.html>`_.
 
 We refer to the :doc:`getting started example <plot_minimal_pydeseq2_pipeline>` for more
@@ -126,7 +126,7 @@
 dds = DeseqDataSet(
     counts=counts_df,
     metadata=metadata,
-    design_factors="condition",
+    design="~condition",
     refit_cooks=True,
     inference=inference,
 )
@@ -149,20 +149,11 @@
 
 
 # %%
-# As such, it can be saved using
-# `pickle.dump <https://docs.python.org/3/library/pickle.html#pickle.dump>`_.
-with open(os.path.join(OUTPUT_PATH, "dds.pkl"), "wb") as f:
-    pkl.dump(dds, f)
-
-
-# %%
-# It may be loaded again using
-# `pickle.load <https://docs.python.org/3/library/pickle.html#pickle.load>`_.
-
-with open(os.path.join(OUTPUT_PATH, "dds.pkl"), "rb") as f:
-    dds2 = pkl.load(f)
-
-print(dds2)
+# After removing unpicklable DeseqDataSet attributes, we can save the corresponding
+# AnnData object. This can be done using the
+# :meth:`to_picklable_anndata() <DeseqDataSet.to_picklable_anndata>` method.
+with open(os.path.join(OUTPUT_PATH, "result_adata.pkl"), "wb") as f:
+    pkl.dump(dds.to_picklable_anndata(), f)
 
 
 # %%
@@ -197,14 +188,14 @@
 # compute p-values and adjusted p-values for differential expresion. This is the role of
 # the :class:`DeseqStats <ds.DeseqStats>` class.
 
-stat_res = DeseqStats(dds, inference=inference)
+ds = DeseqStats(dds, contrast=["condition", "B", "A"], inference=inference)
 
 # %%
 # PyDESeq2 computes p-values using Wald tests. This can be done using the
 # :meth:`summary() <ds.DeseqStats.summary>` method, which runs the whole statistical
 # analysis, cooks filtering and multiple testing adjustement included.
 
-stat_res.summary()
+ds.summary()
 
 # %%
 # The results are then stored in the ``results_df`` attribute (``stat_res.results_df``).
@@ -214,4 +205,4 @@
 # Hence, we may export ``stat_res.results_df`` as CSV, using `pandas.DataFrame.to_csv()
 # <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html>`_.
 
-stat_res.results_df.to_csv(os.path.join(OUTPUT_PATH, "results.csv"))
+ds.results_df.to_csv(os.path.join(OUTPUT_PATH, "results.csv"))