Skip to content

Commit

Permalink
Merge pull request #48 from flatironinstitute/dev
Browse files Browse the repository at this point in the history
v0.5.7
  • Loading branch information
asistradition authored Sep 29, 2021
2 parents dd532f4 + bffa1f2 commit 2a33c74
Show file tree
Hide file tree
Showing 41 changed files with 380 additions and 710 deletions.
1 change: 1 addition & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
python -m pip install -r requirements.txt
python -m pip install -r requirements-test.txt
python -m pip install -r requirements-multiprocessing.txt
python -m pip install numba
- name: Test with pytest & coverage
run: |
python -m coverage run -m pytest
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Copyright (c) 2016-2021 The Simons Foundation, Inc.
Copyright (c) 2021 Broad Institute of MIT and Harvard
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
23 changes: 20 additions & 3 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,22 +1,39 @@
Change Log
==========

Inferelator v0.5.7 `September 29, 2021`
---------------------------------------

New Functionality:

- Added support for numba acceleration of AMuSR with ``.set_run_parameters(use_numba=True)`` (PR #46)

Code Refactoring:

- Updated example scripts
- Removed deprecated KVS multiprocessing and associated code

Bug Fixes:

- Gene labels are included as the first column of the produced confidences TSV file by default
- Matplotlib backend selection checks for non-interactive mode

Inferelator v0.5.6 `August 16, 2021`
-----------------------------------
------------------------------------

New Functionality:

- Added code to randomly generate noise in prior with ``.set_shuffle_parameters(add_prior_noise=None)``
- Added in-workflow benchmarks for CellOracle and pySCENIC
-

Code Refactoring:

- Minor changes to matplotlib interface
- Improved testing for multitask workflows
- Improved error messaging around prior and gold standard
- Switch from Travis.ci to GitHub Actions for continuous integration
-

Inferelator v0.5.5 `April 29, 2021`
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = 'Chris Jackson'

# The full version, including alpha/beta/rc tags
release = 'v0.5.6'
release = 'v0.5.7'


# -- General configuration ---------------------------------------------------
Expand Down
34 changes: 31 additions & 3 deletions examples/Bsubtilis_Network_Inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@
"worker.append_to_path('output_dir', 'final')\n",
"worker.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=False, cv_split_ratio=None)\n",
"worker.set_run_parameters(num_bootstraps=50, random_seed=100)\n",
"final_network = worker.run()"
"\n",
"final_network_results = worker.run()"
]
},
{
Expand All @@ -154,8 +155,35 @@
"outputs": [],
"source": [
"# Visualize network results\n",
"# The workflow returns an InferelatorResults object\n",
"\n",
"# There is a dataframe with an edge table for the final network\n",
"final_network_results.network.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# There is a list of dataframes with model coefficients\n",
"# Each list element is a dataframe with the results from one bootstrap\n",
"# The dataframes are genes x TFs\n",
"\n",
"final_network_results.betas[0].iloc[0:5, 0:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The confidence scores for each network edge are also accessible\n",
"# This dataframe is genes x TFs\n",
"\n",
"final_network.head()"
"final_network_results.combined_confidences.iloc[0:5, 0:5]"
]
}
],
Expand All @@ -175,7 +203,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
34 changes: 31 additions & 3 deletions examples/Yeast_Network_Inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@
"worker.append_to_path('output_dir', 'final')\n",
"worker.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=False, cv_split_ratio=None)\n",
"worker.set_run_parameters(num_bootstraps=50, random_seed=100)\n",
"final_network = worker.run()"
"\n",
"final_network_results = worker.run()"
]
},
{
Expand All @@ -219,8 +220,35 @@
"outputs": [],
"source": [
"# Visualize network results\n",
"# The workflow returns an InferelatorResults object\n",
"\n",
"# There is a dataframe with an edge table for the final network\n",
"final_network_results.network.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# There is a list of dataframes with model coefficients\n",
"# Each list element is a dataframe with the results from one bootstrap\n",
"# The dataframes are genes x TFs\n",
"\n",
"final_network_results.betas[0].iloc[0:5, 0:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The confidence scores for each network edge are also accessible\n",
"# This dataframe is genes x TFs\n",
"\n",
"final_network.head()"
"final_network_results.combined_confidences.iloc[0:5, 0:5]"
]
}
],
Expand All @@ -240,7 +268,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
18 changes: 8 additions & 10 deletions inferelator/amusr_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,16 +386,14 @@ def emit_results(self, betas, rescaled_betas, gold_standard, priors_data):
This is called when `.startup()` is run. It is not necessary to call separately.
"""
if self.is_master():
self.create_output_dir()
rp = self._result_processor_driver(betas, rescaled_betas, filter_method=self.gold_standard_filter_method,
metric=self.metric)
rp.tasks_names = self._task_names
self.results = rp.summarize_network(self.output_dir, gold_standard, self._task_priors)
self.task_results = rp.tasks_networks
return self.results
else:
return None

self.create_output_dir()
rp = self._result_processor_driver(betas, rescaled_betas, filter_method=self.gold_standard_filter_method,
metric=self.metric)
rp.tasks_names = self._task_names
self.results = rp.summarize_network(self.output_dir, gold_standard, self._task_priors)
self.task_results = rp.tasks_networks
return self.results


def create_task_data_object(workflow_class="single-cell"):
Expand Down
8 changes: 5 additions & 3 deletions inferelator/benchmarking/scenic.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,12 @@ def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):
scenic_df.columns = scenic_df.columns.droplevel(0)

mat = [pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()]

mat = pd.concat(mat, axis=0).reindex(prior_data.columns, axis=1).fillna(0)
for tf, data in scenic_df['TargetGenes'].iteritems()]

mat = pd.concat(mat, axis=0).fillna(0)
mat = mat.groupby(mat.index).agg('max')
mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)

return [mat], [mat.copy()]

@staticmethod
Expand Down
36 changes: 17 additions & 19 deletions inferelator/crossvalidation_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,33 +266,32 @@ def _create_writer(self):
Create a CSVWriter and stash it in self.writer
"""

if MPControl.is_master:
# Create a CSV header from grid search param names
self._csv_header = copy.copy(self.grid_params) if self.grid_params is not None else []
# Create a CSV header from grid search param names
self._csv_header = copy.copy(self.grid_params) if self.grid_params is not None else []

# Add Test & Value columns for dropouts/etc
self._csv_header.extend(["Test", "Value", "Num_Obs"])
# Add Test & Value columns for dropouts/etc
self._csv_header.extend(["Test", "Value", "Num_Obs"])

# Also add the metric name
self._csv_header.extend(MetricHandler.get_metric(self.workflow.metric).all_names())
# Also add the metric name
self._csv_header.extend(MetricHandler.get_metric(self.workflow.metric).all_names())

# Create a CSV writer
self._create_output_path()
self._open_csv_handle()
# Create a CSV writer
self._create_output_path()
self._open_csv_handle()

self._csv_writer = self._csv_writer_object(self._csv_file_handle,
delimiter="\t", lineterminator="\n", quoting=csv.QUOTE_NONE)
self._csv_writer = self._csv_writer_object(self._csv_file_handle,
delimiter="\t", lineterminator="\n", quoting=csv.QUOTE_NONE)

# Write the header line
self._csv_writer.writerow(self._csv_header)
# Write the header line
self._csv_writer.writerow(self._csv_header)

def _destroy_writer(self):
"""
Delete the CSVWriter and close the file handle
"""
if MPControl.is_master:
self._csv_file_handle.close()
self._csv_writer = None

self._csv_file_handle.close()
self._csv_writer = None

def _harmonize_paths(self):
"""
Expand Down Expand Up @@ -417,8 +416,7 @@ def _grid_search(self, test=None, value=None, mask_function=None):

results.append(((test, value), result))

if MPControl.is_master:
self._csv_writer.writerow(csv_line)
self._csv_writer.writerow(csv_line)

del cv_workflow

Expand Down
10 changes: 2 additions & 8 deletions inferelator/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,8 @@

# This is a dict, keyed by the class setattr variable name, of tuples (env name, coercion function, default value)
SBATCH_VARS = dict(output_dir=('RUNDIR', str, None),
input_dir=('DATADIR', str, None),
rank=('SLURM_PROCID', int, 0),
cores=('SLURM_NTASKS_PER_NODE', int, 1),
tasks=('SLURM_NTASKS', int, 1),
node=('SLURM_NODEID', int, 0),
num_nodes=('SLURM_JOB_NUM_NODES', int, 1))

SBATCH_VARS_FOR_KVS = ["rank", "cores", "tasks", "node", "num_nodes"]
input_dir=('DATADIR', str, None))

SBATCH_VARS_FOR_WORKFLOW = ["output_dir", "input_dir"]

"""Default Data File Settings"""
Expand Down
12 changes: 0 additions & 12 deletions inferelator/distributed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@ class AbstractController:
# The object which handles the multiprocessing
client = None

# Boolean to identify master processes where needed
is_master = False

# The chunk sizes for calls to map
chunk = 25

Expand Down Expand Up @@ -56,15 +53,6 @@ def set_processes(cls, process_count):
"""
raise NotImplementedError

@classmethod
@abstractmethod
def sync_processes(cls, *args, **kwargs):
"""
This synchronizes multiple processes. Multiprocessing methods which have a defined hierarchy and no risk of
race conditions may simply return True
"""
raise NotImplementedError

@classmethod
@abstractmethod
def shutdown(cls):
Expand Down
10 changes: 1 addition & 9 deletions inferelator/distributed/dask_cluster_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ class DaskHPCClusterController(AbstractController):
_controller_name = "dask-cluster"
_controller_dask = True

is_master = True
client = None

# Cluster Controller
Expand Down Expand Up @@ -278,14 +277,7 @@ def set_processes(cls, process_count):
utils.Debug.vprint("Using `set_job_size_params` is highly preferred", level=0)
utils.Debug.vprint("Configured {n} jobs with {w} workers per job".format(n=cls._job_n, w=cls._job_n_workers),
level=0)

@classmethod
def sync_processes(cls, *args, **kwargs):
"""
This is a thing for KVS. Just return True.
"""
return True


@classmethod
def add_worker_env_line(cls, line):
"""
Expand Down
5 changes: 3 additions & 2 deletions inferelator/distributed/dask_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

def amusr_regress_dask(X, Y, priors, prior_weight, n_tasks, genes, tfs, G, remove_autoregulation=True,
lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None, regression_function=None,
tol=None, rel_tol=None):
tol=None, rel_tol=None, use_numba=False):
"""
Execute multitask (AMUSR)
Expand Down Expand Up @@ -55,7 +55,8 @@ def regression_maker(j, x_df, y_list, prior, tf):

prior = format_prior(prior, gene, tasks, prior_weight, tfs=tf)
return j, regression_function(x, y, tf, tasks, gene, prior,
lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss, tol=tol, rel_tol=rel_tol)
lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss,
tol=tol, rel_tol=rel_tol, use_numba=use_numba)

def response_maker(y_df, i):
y = []
Expand Down
8 changes: 0 additions & 8 deletions inferelator/distributed/dask_k8_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class DaskK8Controller(AbstractController):
_controller_name = "dask-k8"
_controller_dask = True

is_master = True
client = None

## Dask controller variables ##
Expand Down Expand Up @@ -76,13 +75,6 @@ def set_processes(cls, process_count):
"""
cls.processes = process_count

@classmethod
def sync_processes(self, *args, **kwargs):
"""
This is a thing for KVS. Just return True.
"""
return True

@classmethod
def check_cluster_state(cls, *args, **kwargs):
"""
Expand Down
Loading

0 comments on commit 2a33c74

Please sign in to comment.