Skip to content

Commit

Permalink
Merge pull request #61 from flatironinstitute/multitask_multigene
Browse files Browse the repository at this point in the history
Multitask multigene
  • Loading branch information
asistradition authored May 8, 2023
2 parents cc43a81 + 3eb3371 commit 1865cac
Show file tree
Hide file tree
Showing 49 changed files with 5,223 additions and 2,173 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
python-version: ['3.8', '3.9', '3.10']

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
21 changes: 21 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,27 @@
Change Log
==========

Inferelator v0.6.2 `May 8, 2023`
----------------------------------------

New Functionality:

- Generates & reports non-bootstrap model weights as part of results
- Saves full model information into an h5ad file
- Added new experimental prediction modules
- Added new preprocessing & normalization options

Code Refactoring:

- Logging messages now use the logging module

Bug Fixes:

- Fixed several errors when sparse data was passed unexpectedly
- Corrected several deprecated numpy calls
- Updated calls to anndata and the anndata version requirement


Inferelator v0.6.1 `January 3, 2023`
----------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = 'Chris Jackson'

# The full version, including alpha/beta/rc tags
release = 'v0.6.1'
release = 'v0.6.2'


# -- General configuration ---------------------------------------------------
Expand Down
9 changes: 8 additions & 1 deletion inferelator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,11 @@
from inferelator.utils import inferelator_verbose_level
from inferelator.distributed.inferelator_mp import MPControl

from inferelator.workflows import amusr_workflow, single_cell_workflow, tfa_workflow, velocity_workflow
from inferelator.workflows import (
amusr_workflow,
single_cell_workflow,
tfa_workflow,
velocity_workflow
)

from inferelator.regression.base_regression import PreprocessData
47 changes: 38 additions & 9 deletions inferelator/benchmarking/celloracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,25 @@ def reprocess_prior_to_base_GRN(priors_data):
@staticmethod
def reprocess_co_output_to_inferelator_results(co_out):

betas = [r.pivot(index='target', columns='source', values='coef_mean').fillna(0) for k, r in co_out.items()]
rankers = [r.pivot(index='target', columns='source', values='-logp').fillna(0) for k, r in co_out.items()]

return betas, rankers
betas = [
r.pivot(
index='target',
columns='source',
values='coef_mean'
).fillna(0)
for k, r in co_out.items()
]

rankers = [
r.pivot(
index='target',
columns='source',
values='-logp'
).fillna(0)
for k, r in co_out.items()
]

return betas, rankers, betas[0], rankers[0]


class CellOracleRegression(_RegressionWorkflowMixin):
Expand All @@ -108,14 +123,23 @@ def run_regression(self):
oracle.perform_PCA(100)

# Add prior
oracle.addTFinfo_dictionary(self.reprocess_prior_to_base_GRN(self.priors_data))
oracle.addTFinfo_dictionary(
self.reprocess_prior_to_base_GRN(self.priors_data)
)

utils.Debug.vprint("Imputation Preprocessing")

if self.oracle_imputation:

# Heuristics from Celloracle documentation
n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]
n_comps = np.where(
np.diff(
np.diff(
np.cumsum(oracle.pca.explained_variance_ratio_)
) > 0.002
)
)[0][0]

k = int(0.025 * oracle.adata.shape[0])

# Make sure n_comps is between 10 and 50
Expand All @@ -125,15 +149,20 @@ def run_regression(self):
# Make sure k is at least 25 too I guess
k = max(k, 25)

oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
b_maxl=k*4, n_jobs=4)
oracle.knn_imputation(
n_pca_dims=n_comps,
k=k, balanced=True,
b_sight=k*8,
b_maxl=k*4,
n_jobs=4
)

# Pretend to do imputation
else:
oracle.adata.layers["imputed_count"] = oracle.adata.layers["normalized_count"].copy()

utils.Debug.vprint("CellOracle GRN inference")

# Call GRN inference
links = oracle.get_links(cluster_name_for_GRN_unit="louvain", alpha=10,
verbose_level=0, test_mode=False)
Expand Down
4 changes: 2 additions & 2 deletions inferelator/benchmarking/scenic.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,13 +193,13 @@ def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):
mat = mat.groupby(mat.index).agg('max')
mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)

return [mat], [mat.copy()]
return [mat], [mat.copy()], mat.copy(), mat.copy()

@staticmethod
def reprocess_adj_to_inferelator_results(adj):
mat = adj.pivot(index='target', columns='TF', values='importance').fillna(0.)

return [mat], [mat.copy()]
return [mat], [mat.copy()], mat.copy(), mat.copy()

# This code is lifted from https://github.com/aertslab/create_cisTarget_databases/cistarget_db.py
# It is not reimplemented in order to ensure that the methodology for ranking is identical
Expand Down
52 changes: 33 additions & 19 deletions inferelator/distributed/dask_cluster_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ def connect(cls, *args, **kwargs):
cores=cls._job_n_workers * cls._worker_n_threads,
processes=cls._job_n_workers,
job_mem=cls._job_mem,
env_extra=cls._config_env(),
job_script_prologue=cls._config_env(),
local_directory=cls._local_directory,
memory=cls._job_mem,
job_extra=cls._job_slurm_commands,
job_extra_directives=cls._job_slurm_commands,
job_cls=SLURMJobNoMemLimit,
**kwargs
)
Expand Down Expand Up @@ -475,10 +475,11 @@ def _config_str(cls):

return (
f"Dask cluster: Allocated {cls._job_n} jobs ({cls._job_n_workers} "
f"workers with {cls._job_mem} memory per job)\n"
f"SLURM: -p {cls._queue}, -A {cls._project}, " +
", ".join(cls._job_slurm_commands) + "\n",
"ENV: " + "\n\t".join(cls._job_extra_env_commands) + "\n"
f"workers with {cls._job_mem} memory per job) "
f"plus {cls._num_local_workers} local workers "
f"[SLURM]: -p {cls._queue}, -A {cls._project}, "
f"{', '.join(cls._job_slurm_commands)} "
f"[ENV]: {', '.join(cls._job_extra_env_commands)}"
)

@classmethod
Expand All @@ -491,17 +492,21 @@ def _config_env(cls):
@classmethod
def _scale_jobs(cls):
"""
Update the worker tracker. If an entire slurm job is dead, start a new one to replace it.
Update the worker tracker. If an entire slurm job is dead,
start a new one to replace it.
"""
cls._tracker.update_lists(
cls.local_cluster.observed,
cls.local_cluster.worker_spec
)

new_jobs = cls._job_n + cls._tracker.num_dead
max_jobs = cls._runaway_protection * cls._job_n

if cls._runaway_protection is not None and new_jobs > cls._runaway_protection * cls._job_n:
raise RuntimeError("Aborting excessive worker startups / Protecting against runaway job queueing")
if cls._runaway_protection is not None and new_jobs > max_jobs:
raise RuntimeError(
"Aborting excessive worker startups and "
"protecting against runaway job queueing")
elif new_jobs > len(cls.local_cluster.worker_spec):
cls.local_cluster.scale(jobs=new_jobs)

Expand All @@ -513,20 +518,29 @@ def _add_local_node_workers(cls, num_workers):
:param num_workers: The number of workers to start on this node
:type num_workers: int
"""
check.argument_integer(num_workers, low=0, allow_none=True)
check.argument_integer(
num_workers,
low=0,
allow_none=True
)

if num_workers is not None and num_workers > 0:

# Build a dask-worker command
cmd = [cls._local_worker_command,
str(cls.local_cluster.scheduler_address),
"--nprocs", str(num_workers),
"--nthreads", str(cls._worker_n_threads),
"--memory-limit", "0",
"--local-directory", str(cls._local_directory)]
cmd = [
cls._local_worker_command,
str(cls.local_cluster.scheduler_address),
"--nprocs", str(num_workers),
"--nthreads", str(cls._worker_n_threads),
"--memory-limit", "0",
"--local-directory", str(cls._local_directory)
]

# Execute it through the Popen ()
out_path = cls._log_directory if cls._log_directory is not None else "."
if cls._log_directory is not None:
out_path = cls._log_directory
else:
out_path = "."

if not os.path.exists(out_path):
os.makedirs(out_path, exist_ok=True)
Expand Down Expand Up @@ -561,8 +575,8 @@ def _total_workers(
_total = cls._job_n_workers if cls._job_n_workers is not None else 0
_total *= cls._job_n if cls._job_n is not None else 0

if include_local:
_total += cls._num_local_workers if cls._num_local_workers is not None else 0
if include_local and cls._num_local_workers is not None:
_total += cls._num_local_workers

return _total

Expand Down
10 changes: 7 additions & 3 deletions inferelator/postprocessing/column_names.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
This is a central location for dataframe column names used in the postprocessing modules
This is a central location for dataframe column names used in the
postprocessing modules
They're weirdly named for historical reasons and can be changed with no consequence unless you have other
They're weirdly named for historical reasons and can be changed with
no consequence unless you have other
code that uses these names
"""

Expand All @@ -14,6 +16,9 @@
TARGET_COLUMN = "target"
REGULATOR_COLUMN = "regulator"

MODEL_COEF_COLUMN = "model_coefficient"
MODEL_EXP_VAR_COLUMN = "model_exp_var"

# Precision/Recall

PRECISION_COLUMN = "precision"
Expand All @@ -30,4 +35,3 @@
# Confusion Matrix

TP, FP, TN, FN = 'TP', 'FP', 'TN', 'FN'

Loading

0 comments on commit 1865cac

Please sign in to comment.