Merge pull request #48 from flatironinstitute/dev

v0.5.7
flatironinstitute · Sep 29, 2021 · 2a33c74 · 2a33c74
2 parents dd532f4 + bffa1f2
commit 2a33c74
Show file tree

Hide file tree

Showing 41 changed files with 380 additions and 710 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -26,6 +26,7 @@ jobs:
         python -m pip install -r requirements.txt
         python -m pip install -r requirements-test.txt
         python -m pip install -r requirements-multiprocessing.txt
+        python -m pip install numba
     - name: Test with pytest & coverage
       run: |
         python -m coverage run -m pytest

diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,5 @@
 Copyright (c) 2016-2021 The Simons Foundation, Inc.
+Copyright (c) 2021 Broad Institute of MIT and Harvard
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,22 +1,39 @@
 Change Log
 ==========
 
+Inferelator v0.5.7 `September 29, 2021`
+---------------------------------------
+
+New Functionality:
+
+- Added support for numba acceleration of AMuSR with ``.set_run_parameters(use_numba=True)`` (PR #46)
+
+Code Refactoring:
+
+- Updated example scripts
+- Removed deprecated KVS multiprocessing and associated code
+
+Bug Fixes:
+
+- Gene labels are included as the first column of the produced confidences TSV file by default
+- Matplotlib backend selection checks for non-interactive mode
+
 Inferelator v0.5.6 `August 16, 2021`
------------------------------------
+------------------------------------
 
 New Functionality:
 
 - Added code to randomly generate noise in prior with ``.set_shuffle_parameters(add_prior_noise=None)``
 - Added in-workflow benchmarks for CellOracle and pySCENIC
-- 
+  
 
 Code Refactoring:
 
 - Minor changes to matplotlib interface
 - Improved testing for multitask workflows
 - Improved error messaging around prior and gold standard
 - Switch from Travis.ci to GitHub Actions for continuous integration
-- 
+  
 
 Inferelator v0.5.5 `April 29, 2021`
 -----------------------------------

diff --git a/docs/conf.py b/docs/conf.py
@@ -23,7 +23,7 @@
 author = 'Chris Jackson'
 
 # The full version, including alpha/beta/rc tags
-release = 'v0.5.6'
+release = 'v0.5.7'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/examples/Bsubtilis_Network_Inference.ipynb b/examples/Bsubtilis_Network_Inference.ipynb
@@ -144,7 +144,8 @@
     "worker.append_to_path('output_dir', 'final')\n",
     "worker.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=False, cv_split_ratio=None)\n",
     "worker.set_run_parameters(num_bootstraps=50, random_seed=100)\n",
-    "final_network = worker.run()"
+    "\n",
+    "final_network_results = worker.run()"
    ]
   },
   {
@@ -154,8 +155,35 @@
    "outputs": [],
    "source": [
     "# Visualize network results\n",
+    "# The workflow returns an InferelatorResults object\n",
+    "\n",
+    "# There is a dataframe with an edge table for the final network\n",
+    "final_network_results.network.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# There is a list of dataframes with model coefficients\n",
+    "# Each list element is a dataframe with the results from one bootstrap\n",
+    "# The dataframes are genes x TFs\n",
+    "\n",
+    "final_network_results.betas[0].iloc[0:5, 0:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The confidence scores for each network edge are also accessible\n",
+    "# This dataframe is genes x TFs\n",
     "\n",
-    "final_network.head()"
+    "final_network_results.combined_confidences.iloc[0:5, 0:5]"
    ]
   }
  ],
@@ -175,7 +203,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/examples/Yeast_Network_Inference.ipynb b/examples/Yeast_Network_Inference.ipynb
@@ -209,7 +209,8 @@
     "worker.append_to_path('output_dir', 'final')\n",
     "worker.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=False, cv_split_ratio=None)\n",
     "worker.set_run_parameters(num_bootstraps=50, random_seed=100)\n",
-    "final_network = worker.run()"
+    "\n",
+    "final_network_results = worker.run()"
    ]
   },
   {
@@ -219,8 +220,35 @@
    "outputs": [],
    "source": [
     "# Visualize network results\n",
+    "# The workflow returns an InferelatorResults object\n",
+    "\n",
+    "# There is a dataframe with an edge table for the final network\n",
+    "final_network_results.network.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# There is a list of dataframes with model coefficients\n",
+    "# Each list element is a dataframe with the results from one bootstrap\n",
+    "# The dataframes are genes x TFs\n",
+    "\n",
+    "final_network_results.betas[0].iloc[0:5, 0:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The confidence scores for each network edge are also accessible\n",
+    "# This dataframe is genes x TFs\n",
     "\n",
-    "final_network.head()"
+    "final_network_results.combined_confidences.iloc[0:5, 0:5]"
    ]
   }
  ],
@@ -240,7 +268,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/inferelator/amusr_workflow.py b/inferelator/amusr_workflow.py
@@ -386,16 +386,14 @@ def emit_results(self, betas, rescaled_betas, gold_standard, priors_data):
 
         This is called when `.startup()` is run. It is not necessary to call separately.
         """
-        if self.is_master():
-            self.create_output_dir()
-            rp = self._result_processor_driver(betas, rescaled_betas, filter_method=self.gold_standard_filter_method,
-                                               metric=self.metric)
-            rp.tasks_names = self._task_names
-            self.results = rp.summarize_network(self.output_dir, gold_standard, self._task_priors)
-            self.task_results = rp.tasks_networks
-            return self.results
-        else:
-            return None
+
+        self.create_output_dir()
+        rp = self._result_processor_driver(betas, rescaled_betas, filter_method=self.gold_standard_filter_method,
+                                            metric=self.metric)
+        rp.tasks_names = self._task_names
+        self.results = rp.summarize_network(self.output_dir, gold_standard, self._task_priors)
+        self.task_results = rp.tasks_networks
+        return self.results
 
 
 def create_task_data_object(workflow_class="single-cell"):

diff --git a/inferelator/benchmarking/scenic.py b/inferelator/benchmarking/scenic.py
@@ -187,10 +187,12 @@ def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):
             scenic_df.columns = scenic_df.columns.droplevel(0)
 
             mat = [pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
-                   for tf, data in scenic_df['TargetGenes'].iteritems()]
-
-            mat = pd.concat(mat, axis=0).reindex(prior_data.columns, axis=1).fillna(0)
+                    for tf, data in scenic_df['TargetGenes'].iteritems()]
 
+            mat = pd.concat(mat, axis=0).fillna(0)
+            mat = mat.groupby(mat.index).agg('max')
+            mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)
+
         return [mat], [mat.copy()]
 
     @staticmethod

diff --git a/inferelator/crossvalidation_workflow.py b/inferelator/crossvalidation_workflow.py
@@ -266,33 +266,32 @@ def _create_writer(self):
         Create a CSVWriter and stash it in self._csv_writer
         """
 
-        if MPControl.is_master:
-            # Create a CSV header from grid search param names
-            self._csv_header = copy.copy(self.grid_params) if self.grid_params is not None else []
+        # Create a CSV header from grid search param names
+        self._csv_header = copy.copy(self.grid_params) if self.grid_params is not None else []
 
-            # Add Test & Value columns for dropouts/etc
-            self._csv_header.extend(["Test", "Value", "Num_Obs"])
+        # Add Test & Value columns for dropouts/etc
+        self._csv_header.extend(["Test", "Value", "Num_Obs"])
 
-            # Also add the metric name
-            self._csv_header.extend(MetricHandler.get_metric(self.workflow.metric).all_names())
+        # Also add the metric name
+        self._csv_header.extend(MetricHandler.get_metric(self.workflow.metric).all_names())
 
-            # Create a CSV writer
-            self._create_output_path()
-            self._open_csv_handle()
+        # Create a CSV writer
+        self._create_output_path()
+        self._open_csv_handle()
 
-            self._csv_writer = self._csv_writer_object(self._csv_file_handle,
-                                                       delimiter="\t", lineterminator="\n", quoting=csv.QUOTE_NONE)
+        self._csv_writer = self._csv_writer_object(self._csv_file_handle,
+                                                    delimiter="\t", lineterminator="\n", quoting=csv.QUOTE_NONE)
 
-            # Write the header line
-            self._csv_writer.writerow(self._csv_header)
+        # Write the header line
+        self._csv_writer.writerow(self._csv_header)
 
     def _destroy_writer(self):
         """
         Delete the CSVWriter and close the file handle
         """
-        if MPControl.is_master:
-            self._csv_file_handle.close()
-            self._csv_writer = None
+
+        self._csv_file_handle.close()
+        self._csv_writer = None
 
     def _harmonize_paths(self):
         """
@@ -417,8 +416,7 @@ def _grid_search(self, test=None, value=None, mask_function=None):
 
             results.append(((test, value), result))
 
-            if MPControl.is_master:
-                self._csv_writer.writerow(csv_line)
+            self._csv_writer.writerow(csv_line)
 
             del cv_workflow
 

diff --git a/inferelator/default.py b/inferelator/default.py
@@ -9,14 +9,8 @@
 
 # This is a dict, keyed by the class setattr variable name, of tuples (env name, coercion function, default value)
 SBATCH_VARS = dict(output_dir=('RUNDIR', str, None),
-                   input_dir=('DATADIR', str, None),
-                   rank=('SLURM_PROCID', int, 0),
-                   cores=('SLURM_NTASKS_PER_NODE', int, 1),
-                   tasks=('SLURM_NTASKS', int, 1),
-                   node=('SLURM_NODEID', int, 0),
-                   num_nodes=('SLURM_JOB_NUM_NODES', int, 1))
-
-SBATCH_VARS_FOR_KVS = ["rank", "cores", "tasks", "node", "num_nodes"]
+                   input_dir=('DATADIR', str, None))
+
 SBATCH_VARS_FOR_WORKFLOW = ["output_dir", "input_dir"]
 
 """Default Data File Settings"""

diff --git a/inferelator/distributed/__init__.py b/inferelator/distributed/__init__.py
@@ -7,9 +7,6 @@ class AbstractController:
     # The object which handles the multiprocessing
     client = None
 
-    # Boolean to identify master processes where needed
-    is_master = False
-
     # The chunk sizes for calls to map
     chunk = 25
 
@@ -56,15 +53,6 @@ def set_processes(cls, process_count):
         """
         raise NotImplementedError
 
-    @classmethod
-    @abstractmethod
-    def sync_processes(cls, *args, **kwargs):
-        """
-        This synchronizes multiple processes. Multiprocessing methods which have a defined hierarchy and no risk of
-        race conditions may simply return True
-        """
-        raise NotImplementedError
-
     @classmethod
     @abstractmethod
     def shutdown(cls):

diff --git a/inferelator/distributed/dask_cluster_controller.py b/inferelator/distributed/dask_cluster_controller.py
@@ -112,7 +112,6 @@ class DaskHPCClusterController(AbstractController):
     _controller_name = "dask-cluster"
     _controller_dask = True
 
-    is_master = True
     client = None
 
     # Cluster Controller
@@ -278,14 +277,7 @@ def set_processes(cls, process_count):
         utils.Debug.vprint("Using `set_job_size_params` is highly preferred", level=0)
         utils.Debug.vprint("Configured {n} jobs with {w} workers per job".format(n=cls._job_n, w=cls._job_n_workers),
                            level=0)
-
-    @classmethod
-    def sync_processes(cls, *args, **kwargs):
-        """
-        This is a thing for KVS. Just return True.
-        """
-        return True
-
+
     @classmethod
     def add_worker_env_line(cls, line):
         """

diff --git a/inferelator/distributed/dask_functions.py b/inferelator/distributed/dask_functions.py
@@ -17,7 +17,7 @@
 
 def amusr_regress_dask(X, Y, priors, prior_weight, n_tasks, genes, tfs, G, remove_autoregulation=True,
                        lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None, regression_function=None, 
-                       tol=None, rel_tol=None):
+                       tol=None, rel_tol=None, use_numba=False):
     """
     Execute multitask (AMUSR)
 
@@ -55,7 +55,8 @@ def regression_maker(j, x_df, y_list, prior, tf):
 
         prior = format_prior(prior, gene, tasks, prior_weight, tfs=tf)
         return j, regression_function(x, y, tf, tasks, gene, prior,
-                                      lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss, tol=tol, rel_tol=rel_tol)
+                                      lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss, 
+                                      tol=tol, rel_tol=rel_tol, use_numba=use_numba)
 
     def response_maker(y_df, i):
         y = []

diff --git a/inferelator/distributed/dask_k8_controller.py b/inferelator/distributed/dask_k8_controller.py
@@ -27,7 +27,6 @@ class DaskK8Controller(AbstractController):
     _controller_name = "dask-k8"
     _controller_dask = True
 
-    is_master = True
     client = None
 
     ## Dask controller variables ##
@@ -76,13 +75,6 @@ def set_processes(cls, process_count):
         """
         cls.processes = process_count
 
-    @classmethod
-    def sync_processes(self, *args, **kwargs):
-        """
-        This is a thing for KVS. Just return True.
-        """
-        return True
-
     @classmethod
     def check_cluster_state(cls, *args, **kwargs):
         """