Merge pull request #65 from flatironinstitute/multitask_multigene
v0.6.3
asistradition authored Jul 8, 2024
2 parents 5fd992e + 23848c9 commit 9a0e0a8
Showing 48 changed files with 1,642 additions and 550 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v3
11 changes: 11 additions & 0 deletions docs/changelog.rst
@@ -1,6 +1,17 @@
Change Log
==========

Inferelator v0.6.3 `August 15, 2023`
----------------------------------------

New Functionality:

- Accepts h5ad objects directly into the constructor

Bug Fixes:

- Fixed several deprecated arguments in dependencies

Inferelator v0.6.2 `May 8, 2023`
----------------------------------------

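Note on the new h5ad support: the changelog entry above is the only description of the feature in this commit, so the sketch below only shows the anndata side, which is standard; the exact inferelator constructor argument that receives the AnnData object is not shown in this diff and is left as a comment rather than guessed.

# Minimal sketch, assuming the expression matrix lives in "expression.h5ad".
# Reading the file with anndata is standard; handing the resulting AnnData
# directly to the inferelator data constructor is the new v0.6.3 behaviour
# described in the changelog above (see the package docs for the exact call).
import anndata as ad

adata = ad.read_h5ad("expression.h5ad")   # AnnData object (obs = cells, var = genes)
print(adata)                              # summary of obs/var annotations and layers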
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -23,7 +23,7 @@
author = 'Chris Jackson'

# The full version, including alpha/beta/rc tags
release = 'v0.6.2'
release = 'v0.6.3'


# -- General configuration ---------------------------------------------------
8 changes: 1 addition & 7 deletions inferelator/__init__.py
@@ -1,10 +1,4 @@
# Set threading control variables if they're not already set

import os
os.environ["MKL_NUM_THREADS"] = os.environ.get("MKL_NUM_THREADS", "1")
os.environ["NUMEXPR_NUM_THREADS"] = os.environ.get("NUMEXPR_NUM_THREADS", "1")
os.environ["OMP_NUM_THREADS"] = os.environ.get("OMP_NUM_THREADS", "1")
os.environ["OPENBLAS_NUM_THREADS"] = os.environ.get("OPENBLAS_NUM_THREADS", "1")
__version__ = '0.6.3'

from inferelator.workflow import inferelator_workflow
from inferelator.crossvalidation_workflow import CrossValidationManager
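The block removed from inferelator/__init__.py above pinned the numerical-library thread counts at import time. Anyone relying on that behaviour can set the same variables in their own script before importing; a minimal sketch using the environment variable names taken from the removed code:

# Set the threading variables before importing numpy-backed libraries so
# MKL, numexpr, OpenMP and OpenBLAS each stay single-threaded, as the old
# import-time code did.
import os

for var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS",
            "OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS"):
    os.environ.setdefault(var, "1")

import inferelator  # noqa: E402 -- import after the environment is configured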
198 changes: 136 additions & 62 deletions inferelator/benchmarking/scenic.py
@@ -25,39 +25,51 @@
FEATHER_FILE_NAME = "RANKED.gene_based.max.feather"
MOTIF_TABLE_NAME = "motifs-from-binary-prior.tbl"

MOTIF_TABLE_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description']

MOTIF_TABLE_DEFAULTS = {'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"}

MOTIF_NAME_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name']
MOTIF_TABLE_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description'
]

MOTIF_TABLE_DEFAULTS = {
'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"
}

MOTIF_NAME_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name'
]


class SCENICWorkflow(SingleCellWorkflow):


do_scenic = True

_do_preprocessing = True
_do_scaling = True

dask_temp_path = None
_tmp_handle = None

@@ -68,45 +80,72 @@ class SCENICWorkflow(SingleCellWorkflow):
def tmp_dir(self):

if self._tmp_handle is None:
self._tmp_handle = tempfile.TemporaryDirectory(prefix="SCENIC_", dir=self.dask_temp_path)

self._tmp_handle = tempfile.TemporaryDirectory(
prefix="SCENIC_",
dir=self.dask_temp_path
)

return self._tmp_handle.name

def startup_finish(self):

self.align_priors_and_expression()

tf_names = self.tf_names if self.tf_names is not None else self.priors_data.columns
self.tf_names = [t for t in tf_names if t in self.data.gene_names]
if self.tf_names is not None:
tf_names = self.tf_names
else:
tf_names = self.priors_data.columns

self.tf_names = [
t for t in tf_names
if t in self.data.gene_names
]

utils.Debug.vprint("Generating SCENIC prior files", level=0)

self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()
if self.do_scenic:
self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()

if self._do_preprocessing:
utils.Debug.vprint("Preprocessing data")

utils.Debug.vprint("Preprocessing data")
sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)

sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)
self.data.convert_to_float()

self.data.convert_to_float()
sc.pp.normalize_per_cell(
self.data._adata,
counts_per_cell_after=1e4
)

sc.pp.normalize_per_cell(self.data._adata, counts_per_cell_after=1e4)
sc.pp.log1p(self.data._adata)
sc.pp.scale(self.data._adata, max_value=10)
sc.pp.log1p(self.data._adata)

if self._do_scaling:
sc.pp.scale(self.data._adata, max_value=10)

def create_feather_file_from_prior(self):

# Get rid of TFs which have no edges
new_prior = self.priors_data.loc[:, (self.priors_data != 0).sum(axis=0) > 0]
new_prior = self.priors_data.loc[
:,
(self.priors_data != 0).sum(axis=0) > 0
]

# Make sure to include all genes
new_prior = new_prior.reindex(self.data.gene_names, axis=0).fillna(0).T.astype(int)
new_prior = new_prior.reindex(
self.data.gene_names,
axis=0
).fillna(0).T.astype(int)

new_prior.index.name = 'features'

for i in range(new_prior.shape[0]):
new_prior.iloc[i, :] = scenic_ranking_prior(new_prior.iloc[i, :], seed=42 + i).astype(int)
new_prior.iloc[i, :] = scenic_ranking_prior(
new_prior.iloc[i, :],
seed=42 + i
).astype(int)

new_prior.reset_index(inplace=True)
feather_file = os.path.join(self.tmp_dir, FEATHER_FILE_NAME)
@@ -129,18 +168,21 @@ def create_motif_table_from_prior(self):
mt.to_csv(motif_table_file, sep="\t", index=False)

return motif_table_file


class SCENICRegression(_RegressionWorkflowMixin):

adjacency_method = "grnboost2"
do_scenic = True

def run_regression(self):

data_df = self.data.to_df()

utils.Debug.vprint("Calculating {m} adjacencies".format(m=self.adjacency_method), level=0)
utils.Debug.vprint(
f"Calculating {self.adjacency_method} adjacencies",
level=0
)

# Get adjacencies
adj_method = ADJ_METHODS[self.adjacency_method]
@@ -151,53 +193,85 @@ def run_regression(self):
else:
client_or_address = 'local'

adjacencies = adj_method(data_df, tf_names=self.tf_names, verbose=True, client_or_address=client_or_address,
seed=self.random_seed)
adjacencies = adj_method(
data_df,
tf_names=self.tf_names,
verbose=True,
client_or_address=client_or_address,
seed=self.random_seed
)

utils.Debug.vprint(
f"{self.adjacency_method} adjacencies {adjacencies.shape}",
level=0
)

if self.do_scenic:

# Convert adjacencies to modules
modules = list(modules_from_adjacencies(adjacencies, data_df))

# Load feather (rank) databases
dbs = [RankingDatabase(fname = self._feather_rank_file, name = "RANKING_PRIOR")]
dbs = [
RankingDatabase(
fname=self._feather_rank_file,
name="RANKING_PRIOR"
)
]

utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

# Prune to df
df = prune2df(dbs, modules, self._motif_link_table_file, client_or_address=client_or_address)

return self.reprocess_scenic_output_to_inferelator_results(df, self.priors_data)
df = prune2df(
dbs,
modules,
self._motif_link_table_file,
client_or_address=client_or_address
)

return self.reprocess_scenic_output_to_inferelator_results(
df,
self.priors_data
)

else:

return self.reprocess_adj_to_inferelator_results(adjacencies)


@staticmethod
def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):

# if there's nothing in the scenic output make an empty dataframe of 0s
if scenic_df.shape[0] == 0:
mat = pd.DataFrame(0.0, index=prior_data.index, columns=prior_data.columns)
mat = pd.DataFrame(
0.0,
index=prior_data.index,
columns=prior_data.columns
)

else:
scenic_df = scenic_df.copy()
scenic_df.index = scenic_df.index.droplevel(1)
scenic_df.columns = scenic_df.columns.droplevel(0)

mat = [pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()]
mat = [
pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()
]

mat = pd.concat(mat, axis=0).fillna(0)
mat = mat.groupby(mat.index).agg('max')
mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)

return [mat], [mat.copy()], mat.copy(), mat.copy()

@staticmethod
def reprocess_adj_to_inferelator_results(adj):
mat = adj.pivot(index='target', columns='TF', values='importance').fillna(0.)
mat = adj.pivot(
index='target',
columns='TF',
values='importance'
).fillna(0.)

return [mat], [mat.copy()], mat.copy(), mat.copy()

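The reshaping in reprocess_adj_to_inferelator_results above is a plain pandas pivot from a long adjacency table to a target-by-TF matrix. A small self-contained illustration; the column names match the code above, but the values are invented for the example:

# Toy data with the same columns the adjacency frame uses above.
import pandas as pd

adj = pd.DataFrame({
    "TF": ["tfA", "tfA", "tfB"],
    "target": ["gene1", "gene2", "gene1"],
    "importance": [0.9, 0.4, 0.7],
})

# Same reshape as the workflow code: one row per target, one column per TF,
# missing edges filled with 0.
mat = adj.pivot(index="target", columns="TF", values="importance").fillna(0.)
print(mat)
# TF      tfA  tfB
# target
# gene1   0.9  0.7
# gene2   0.4  0.0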
16 changes: 9 additions & 7 deletions inferelator/crossvalidation_workflow.py
@@ -1,10 +1,10 @@
"""
This is a manager which will take an Inferelator workflow and repeatedly run it with different parameters.
This is implemented using deep copies; it is therefore memory-intensive.
This is a manager which will take an Inferelator workflow and repeatedly
run it with different parameters.
This is implemented using deep copies;
it is therefore memory-intensive.
"""

from __future__ import print_function

# I hate py2 now
try:
from builtins import FileExistsError
Expand All @@ -20,8 +20,6 @@
import numpy as np
import pandas as pd


from inferelator.distributed.inferelator_mp import MPControl
from inferelator.utils import Validator as check
from inferelator import utils
from inferelator import workflow
@@ -102,7 +100,11 @@ def workflow(self):

@workflow.setter
def workflow(self, wkf):
assert check.argument_is_subclass(wkf, workflow.WorkflowBase, allow_none=True)
assert check.argument_is_subclass(
wkf,
workflow.WorkflowBase,
allow_none=True
)
if self._baseline_workflow is not None:
warnings.warn("Replacing stored workflow with a new workflow")
self._baseline_workflow = wkf
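The module docstring above describes the cross-validation strategy: each parameter setting is run against an independent deep copy of the baseline workflow, trading memory for isolation. A minimal sketch of that pattern in plain Python; the toy class and the attribute being varied are illustrative, not the manager's actual interface:

# Each parameter value gets its own deep copy of the baseline object, so
# runs cannot share mutable state -- the memory-intensive pattern the
# docstring describes.
import copy

class ToyWorkflow:
    """Stand-in for a workflow object; not the inferelator class."""
    def __init__(self):
        self.num_bootstraps = 2

    def run(self):
        return f"ran with num_bootstraps={self.num_bootstraps}"

baseline = ToyWorkflow()

for value in (2, 5, 10):
    wkf = copy.deepcopy(baseline)   # independent copy per setting
    wkf.num_bootstraps = value      # vary one parameter on the copy
    print(wkf.run())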