Merge pull request #65 from flatironinstitute/multitask_multigene
v0.6.3
asistradition authored Jul 8, 2024
2 parents 5fd992e + 23848c9 commit 9a0e0a8
Showing 48 changed files with 1,642 additions and 550 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v3
11 changes: 11 additions & 0 deletions docs/changelog.rst
@@ -1,6 +1,17 @@
Change Log
==========

Inferelator v0.6.3 `August 15, 2023`
----------------------------------------

New Functionality:

- Accepts h5ad objects directly into the constructor

Bug Fixes:

- Fixed several deprecated arguments in dependencies

Inferelator v0.6.2 `May 8, 2023`
----------------------------------------

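Note on the new h5ad support: the changelog entry above is the only description of the feature in this commit, so the sketch below only shows the anndata side, which is standard; the exact inferelator constructor argument that receives the AnnData object is not shown in this diff and is left as a comment rather than guessed.

# Minimal sketch, assuming the expression matrix lives in "expression.h5ad".
# Reading the file with anndata is standard; handing the resulting AnnData
# directly to the inferelator data constructor is the new v0.6.3 behaviour
# described in the changelog above (see the package docs for the exact call).
import anndata as ad

adata = ad.read_h5ad("expression.h5ad")   # AnnData object (obs = cells, var = genes)
print(adata)                              # summary of obs/var annotations and layers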
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -23,7 +23,7 @@
author = 'Chris Jackson'

# The full version, including alpha/beta/rc tags
release = 'v0.6.2'
release = 'v0.6.3'


# -- General configuration ---------------------------------------------------
8 changes: 1 addition & 7 deletions inferelator/__init__.py
@@ -1,10 +1,4 @@
# Set threading control variables if they're not already set

import os
os.environ["MKL_NUM_THREADS"] = os.environ.get("MKL_NUM_THREADS", "1")
os.environ["NUMEXPR_NUM_THREADS"] = os.environ.get("NUMEXPR_NUM_THREADS", "1")
os.environ["OMP_NUM_THREADS"] = os.environ.get("OMP_NUM_THREADS", "1")
os.environ["OPENBLAS_NUM_THREADS"] = os.environ.get("OPENBLAS_NUM_THREADS", "1")
__version__ = '0.6.3'

from inferelator.workflow import inferelator_workflow
from inferelator.crossvalidation_workflow import CrossValidationManager
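The block removed from inferelator/__init__.py above pinned the numerical-library thread counts at import time. Anyone relying on that behaviour can set the same variables in their own script before importing; a minimal sketch using the environment variable names taken from the removed code:

# Set the threading variables before importing numpy-backed libraries so
# MKL, numexpr, OpenMP and OpenBLAS each stay single-threaded, as the old
# import-time code did.
import os

for var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS",
            "OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS"):
    os.environ.setdefault(var, "1")

import inferelator  # noqa: E402 -- import after the environment is configured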
198 changes: 136 additions & 62 deletions inferelator/benchmarking/scenic.py
@@ -25,39 +25,51 @@
FEATHER_FILE_NAME = "RANKED.gene_based.max.feather"
MOTIF_TABLE_NAME = "motifs-from-binary-prior.tbl"

MOTIF_TABLE_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description']

MOTIF_TABLE_DEFAULTS = {'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"}

MOTIF_NAME_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name']
MOTIF_TABLE_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description'
]

MOTIF_TABLE_DEFAULTS = {
'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"
}

MOTIF_NAME_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name'
]


class SCENICWorkflow(SingleCellWorkflow):


do_scenic = True

_do_preprocessing = True
_do_scaling = True

dask_temp_path = None
_tmp_handle = None

@@ -68,45 +80,72 @@ class SCENICWorkflow(SingleCellWorkflow):
def tmp_dir(self):

if self._tmp_handle is None:
self._tmp_handle = tempfile.TemporaryDirectory(prefix="SCENIC_", dir=self.dask_temp_path)

self._tmp_handle = tempfile.TemporaryDirectory(
prefix="SCENIC_",
dir=self.dask_temp_path
)

return self._tmp_handle.name

def startup_finish(self):

self.align_priors_and_expression()

tf_names = self.tf_names if self.tf_names is not None else self.priors_data.columns
self.tf_names = [t for t in tf_names if t in self.data.gene_names]
if self.tf_names is not None:
tf_names = self.tf_names
else:
tf_names = self.priors_data.columns

self.tf_names = [
t for t in tf_names
if t in self.data.gene_names
]

utils.Debug.vprint("Generating SCENIC prior files", level=0)

self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()
if self.do_scenic:
self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()

if self._do_preprocessing:
utils.Debug.vprint("Preprocessing data")

utils.Debug.vprint("Preprocessing data")
sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)

sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)
self.data.convert_to_float()

self.data.convert_to_float()
sc.pp.normalize_per_cell(
self.data._adata,
counts_per_cell_after=1e4
)

sc.pp.normalize_per_cell(self.data._adata, counts_per_cell_after=1e4)
sc.pp.log1p(self.data._adata)
sc.pp.scale(self.data._adata, max_value=10)
sc.pp.log1p(self.data._adata)

if self._do_scaling:
sc.pp.scale(self.data._adata, max_value=10)

def create_feather_file_from_prior(self):

# Get rid of TFs which have no edges
new_prior = self.priors_data.loc[:, (self.priors_data != 0).sum(axis=0) > 0]
new_prior = self.priors_data.loc[
:,
(self.priors_data != 0).sum(axis=0) > 0
]

# Make sure to include all genes
new_prior = new_prior.reindex(self.data.gene_names, axis=0).fillna(0).T.astype(int)
new_prior = new_prior.reindex(
self.data.gene_names,
axis=0
).fillna(0).T.astype(int)

new_prior.index.name = 'features'

for i in range(new_prior.shape[0]):
new_prior.iloc[i, :] = scenic_ranking_prior(new_prior.iloc[i, :], seed=42 + i).astype(int)
new_prior.iloc[i, :] = scenic_ranking_prior(
new_prior.iloc[i, :],
seed=42 + i
).astype(int)

new_prior.reset_index(inplace=True)
feather_file = os.path.join(self.tmp_dir, FEATHER_FILE_NAME)
@@ -129,18 +168,21 @@ def create_motif_table_from_prior(self):
mt.to_csv(motif_table_file, sep="\t", index=False)

return motif_table_file


class SCENICRegression(_RegressionWorkflowMixin):

adjacency_method = "grnboost2"
do_scenic = True

def run_regression(self):

data_df = self.data.to_df()

utils.Debug.vprint("Calculating {m} adjacencies".format(m=self.adjacency_method), level=0)
utils.Debug.vprint(
f"Calculating {self.adjacency_method} adjacencies",
level=0
)

# Get adjacencies
adj_method = ADJ_METHODS[self.adjacency_method]
@@ -151,53 +193,85 @@ def run_regression(self):
else:
client_or_address = 'local'

adjacencies = adj_method(data_df, tf_names=self.tf_names, verbose=True, client_or_address=client_or_address,
seed=self.random_seed)
adjacencies = adj_method(
data_df,
tf_names=self.tf_names,
verbose=True,
client_or_address=client_or_address,
seed=self.random_seed
)

utils.Debug.vprint(
f"{self.adjacency_method} adjacencies {adjacencies.shape}",
level=0
)

if self.do_scenic:

# Convert adjacencies to modules
modules = list(modules_from_adjacencies(adjacencies, data_df))

# Load feather (rank) databases
dbs = [RankingDatabase(fname = self._feather_rank_file, name = "RANKING_PRIOR")]
dbs = [
RankingDatabase(
fname=self._feather_rank_file,
name="RANKING_PRIOR"
)
]

utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

# Prune to df
df = prune2df(dbs, modules, self._motif_link_table_file, client_or_address=client_or_address)

return self.reprocess_scenic_output_to_inferelator_results(df, self.priors_data)
df = prune2df(
dbs,
modules,
self._motif_link_table_file,
client_or_address=client_or_address
)

return self.reprocess_scenic_output_to_inferelator_results(
df,
self.priors_data
)

else:

return self.reprocess_adj_to_inferelator_results(adjacencies)


@staticmethod
def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):

# if there's nothing in the scenic output make an empty dataframe of 0s
if scenic_df.shape[0] == 0:
mat = pd.DataFrame(0.0, index=prior_data.index, columns=prior_data.columns)
mat = pd.DataFrame(
0.0,
index=prior_data.index,
columns=prior_data.columns
)

else:
scenic_df = scenic_df.copy()
scenic_df.index = scenic_df.index.droplevel(1)
scenic_df.columns = scenic_df.columns.droplevel(0)

mat = [pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()]
mat = [
pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()
]

mat = pd.concat(mat, axis=0).fillna(0)
mat = mat.groupby(mat.index).agg('max')
mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)

return [mat], [mat.copy()], mat.copy(), mat.copy()

@staticmethod
def reprocess_adj_to_inferelator_results(adj):
mat = adj.pivot(index='target', columns='TF', values='importance').fillna(0.)
mat = adj.pivot(
index='target',
columns='TF',
values='importance'
).fillna(0.)

return [mat], [mat.copy()], mat.copy(), mat.copy()

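The reshaping in reprocess_adj_to_inferelator_results above is a plain pandas pivot from a long adjacency table to a target-by-TF matrix. A small self-contained illustration; the column names match the code above, but the values are invented for the example:

# Toy data with the same columns the adjacency frame uses above.
import pandas as pd

adj = pd.DataFrame({
    "TF": ["tfA", "tfA", "tfB"],
    "target": ["gene1", "gene2", "gene1"],
    "importance": [0.9, 0.4, 0.7],
})

# Same reshape as the workflow code: one row per target, one column per TF,
# missing edges filled with 0.
mat = adj.pivot(index="target", columns="TF", values="importance").fillna(0.)
print(mat)
# TF      tfA  tfB
# target
# gene1   0.9  0.7
# gene2   0.4  0.0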
16 changes: 9 additions & 7 deletions inferelator/crossvalidation_workflow.py
@@ -1,10 +1,10 @@
"""
This is a manager which will take an Inferelator workflow and repeatedly run it with different parameters.
This is implemented using deep copies; it is therefore memory-intensive.
This is a manager which will take an Inferelator workflow and repeatedly
run it with different parameters.
This is implemented using deep copies;
it is therefore memory-intensive.
"""

from __future__ import print_function

# I hate py2 now
try:
from builtins import FileExistsError
Expand All @@ -20,8 +20,6 @@
import numpy as np
import pandas as pd


from inferelator.distributed.inferelator_mp import MPControl
from inferelator.utils import Validator as check
from inferelator import utils
from inferelator import workflow
@@ -102,7 +100,11 @@ def workflow(self):

@workflow.setter
def workflow(self, wkf):
assert check.argument_is_subclass(wkf, workflow.WorkflowBase, allow_none=True)
assert check.argument_is_subclass(
wkf,
workflow.WorkflowBase,
allow_none=True
)
if self._baseline_workflow is not None:
warnings.warn("Replacing stored workflow with a new workflow")
self._baseline_workflow = wkf
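The module docstring above describes the cross-validation strategy: each parameter setting is run against an independent deep copy of the baseline workflow, trading memory for isolation. A minimal sketch of that pattern in plain Python; the toy class and the attribute being varied are illustrative, not the manager's actual interface:

# Each parameter value gets its own deep copy of the baseline object, so
# runs cannot share mutable state -- the memory-intensive pattern the
# docstring describes.
import copy

class ToyWorkflow:
    """Stand-in for a workflow object; not the inferelator class."""
    def __init__(self):
        self.num_bootstraps = 2

    def run(self):
        return f"ran with num_bootstraps={self.num_bootstraps}"

baseline = ToyWorkflow()

for value in (2, 5, 10):
    wkf = copy.deepcopy(baseline)   # independent copy per setting
    wkf.num_bootstraps = value      # vary one parameter on the copy
    print(wkf.run())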