Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multitask multigene #65

Merged
merged 20 commits into the base branch from the feature branch on
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v3
Expand Down
11 changes: 11 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
Change Log
==========

Inferelator v0.6.3 `August 15, 2023`
----------------------------------------

New Functionality:

- Accepts h5ad objects directly into the constructor

Bug Fixes:

- Fixed several deprecated arguments in dependencies

Inferelator v0.6.2 `May 8, 2023`
----------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = 'Chris Jackson'

# The full version, including alpha/beta/rc tags
release = 'v0.6.2'
release = 'v0.6.3'


# -- General configuration ---------------------------------------------------
Expand Down
8 changes: 1 addition & 7 deletions inferelator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
# Set threading control variables if they're not already set

import os
os.environ["MKL_NUM_THREADS"] = os.environ.get("MKL_NUM_THREADS", "1")
os.environ["NUMEXPR_NUM_THREADS"] = os.environ.get("NUMEXPR_NUM_THREADS", "1")
os.environ["OMP_NUM_THREADS"] = os.environ.get("OMP_NUM_THREADS", "1")
os.environ["OPENBLAS_NUM_THREADS"] = os.environ.get("OPENBLAS_NUM_THREADS", "1")
__version__ = '0.6.3'

from inferelator.workflow import inferelator_workflow
from inferelator.crossvalidation_workflow import CrossValidationManager
Expand Down
198 changes: 136 additions & 62 deletions inferelator/benchmarking/scenic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,39 +25,51 @@
FEATHER_FILE_NAME = "RANKED.gene_based.max.feather"
MOTIF_TABLE_NAME = "motifs-from-binary-prior.tbl"

MOTIF_TABLE_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description']

MOTIF_TABLE_DEFAULTS = {'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"}

MOTIF_NAME_COLS = ['#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name']
MOTIF_TABLE_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name',
'motif_similarity_qvalue',
'similar_motif_id',
'similar_motif_description',
'orthologous_identity',
'orthologous_gene_name',
'orthologous_species',
'description'
]

MOTIF_TABLE_DEFAULTS = {
'source_name': "binary_prior",
'source_version': 1.0,
'motif_similarity_qvalue': 0.0,
'similar_motif_id': None,
'similar_motif_description': None,
'orthologous_identity': 1.0,
'orthologous_gene_name': None,
'orthologous_species': None,
'description': "Gene"
}

MOTIF_NAME_COLS = [
'#motif_id',
'motif_name',
'motif_description',
'source_name',
'source_version',
'gene_name'
]


class SCENICWorkflow(SingleCellWorkflow):


do_scenic = True

_do_preprocessing = True
_do_scaling = True

dask_temp_path = None
_tmp_handle = None

Expand All @@ -68,45 +80,72 @@ class SCENICWorkflow(SingleCellWorkflow):
def tmp_dir(self):

if self._tmp_handle is None:
self._tmp_handle = tempfile.TemporaryDirectory(prefix="SCENIC_", dir=self.dask_temp_path)

self._tmp_handle = tempfile.TemporaryDirectory(
prefix="SCENIC_",
dir=self.dask_temp_path
)

return self._tmp_handle.name

def startup_finish(self):

self.align_priors_and_expression()

tf_names = self.tf_names if self.tf_names is not None else self.priors_data.columns
self.tf_names = [t for t in tf_names if t in self.data.gene_names]
if self.tf_names is not None:
tf_names = self.tf_names
else:
tf_names = self.priors_data.columns

self.tf_names = [
t for t in tf_names
if t in self.data.gene_names
]

utils.Debug.vprint("Generating SCENIC prior files", level=0)

self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()
if self.do_scenic:
self._feather_rank_file = self.create_feather_file_from_prior()
self._motif_link_table_file = self.create_motif_table_from_prior()

if self._do_preprocessing:
utils.Debug.vprint("Preprocessing data")

utils.Debug.vprint("Preprocessing data")
sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)

sc.pp.filter_cells(self.data._adata, min_genes=200)
sc.pp.filter_genes(self.data._adata, min_cells=3)
self.data.convert_to_float()

self.data.convert_to_float()
sc.pp.normalize_per_cell(
self.data._adata,
counts_per_cell_after=1e4
)

sc.pp.normalize_per_cell(self.data._adata, counts_per_cell_after=1e4)
sc.pp.log1p(self.data._adata)
sc.pp.scale(self.data._adata, max_value=10)
sc.pp.log1p(self.data._adata)

if self._do_scaling:
sc.pp.scale(self.data._adata, max_value=10)

def create_feather_file_from_prior(self):

# Get rid of TFs which have no edges
new_prior = self.priors_data.loc[:, (self.priors_data != 0).sum(axis=0) > 0]
new_prior = self.priors_data.loc[
:,
(self.priors_data != 0).sum(axis=0) > 0
]

# Make sure to include all genes
new_prior = new_prior.reindex(self.data.gene_names, axis=0).fillna(0).T.astype(int)
new_prior = new_prior.reindex(
self.data.gene_names,
axis=0
).fillna(0).T.astype(int)

new_prior.index.name = 'features'

for i in range(new_prior.shape[0]):
new_prior.iloc[i, :] = scenic_ranking_prior(new_prior.iloc[i, :], seed=42 + i).astype(int)
new_prior.iloc[i, :] = scenic_ranking_prior(
new_prior.iloc[i, :],
seed=42 + i
).astype(int)

new_prior.reset_index(inplace=True)
feather_file = os.path.join(self.tmp_dir, FEATHER_FILE_NAME)
Expand All @@ -129,18 +168,21 @@ def create_motif_table_from_prior(self):
mt.to_csv(motif_table_file, sep="\t", index=False)

return motif_table_file


class SCENICRegression(_RegressionWorkflowMixin):

adjacency_method = "grnboost2"
do_scenic = True

def run_regression(self):

data_df = self.data.to_df()

utils.Debug.vprint("Calculating {m} adjacencies".format(m=self.adjacency_method), level=0)
utils.Debug.vprint(
f"Calculating {self.adjacency_method} adjacencies",
level=0
)

# Get adjacencies
adj_method = ADJ_METHODS[self.adjacency_method]
Expand All @@ -151,53 +193,85 @@ def run_regression(self):
else:
client_or_address = 'local'

adjacencies = adj_method(data_df, tf_names=self.tf_names, verbose=True, client_or_address=client_or_address,
seed=self.random_seed)
adjacencies = adj_method(
data_df,
tf_names=self.tf_names,
verbose=True,
client_or_address=client_or_address,
seed=self.random_seed
)

utils.Debug.vprint(
f"{self.adjacency_method} adjacencies {adjacencies.shape}",
level=0
)

if self.do_scenic:

# Convert adjacencies to modules
modules = list(modules_from_adjacencies(adjacencies, data_df))

# Load feather (rank) databases
dbs = [RankingDatabase(fname = self._feather_rank_file, name = "RANKING_PRIOR")]
dbs = [
RankingDatabase(
fname=self._feather_rank_file,
name="RANKING_PRIOR"
)
]

utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

# Prune to df
df = prune2df(dbs, modules, self._motif_link_table_file, client_or_address=client_or_address)

return self.reprocess_scenic_output_to_inferelator_results(df, self.priors_data)
df = prune2df(
dbs,
modules,
self._motif_link_table_file,
client_or_address=client_or_address
)

return self.reprocess_scenic_output_to_inferelator_results(
df,
self.priors_data
)

else:

return self.reprocess_adj_to_inferelator_results(adjacencies)


@staticmethod
def reprocess_scenic_output_to_inferelator_results(scenic_df, prior_data):

# if there's nothing in the scenic output make an empty dataframe of 0s
if scenic_df.shape[0] == 0:
mat = pd.DataFrame(0.0, index=prior_data.index, columns=prior_data.columns)
mat = pd.DataFrame(
0.0,
index=prior_data.index,
columns=prior_data.columns
)

else:
scenic_df = scenic_df.copy()
scenic_df.index = scenic_df.index.droplevel(1)
scenic_df.columns = scenic_df.columns.droplevel(0)

mat = [pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()]
mat = [
pd.DataFrame(data).set_index(0).rename({1: tf}, axis=1)
for tf, data in scenic_df['TargetGenes'].iteritems()
]

mat = pd.concat(mat, axis=0).fillna(0)
mat = mat.groupby(mat.index).agg('max')
mat = mat.reindex(prior_data.columns, axis=1).reindex(prior_data.index, axis=0).fillna(0)

return [mat], [mat.copy()], mat.copy(), mat.copy()

@staticmethod
def reprocess_adj_to_inferelator_results(adj):
mat = adj.pivot(index='target', columns='TF', values='importance').fillna(0.)
mat = adj.pivot(
index='target',
columns='TF',
values='importance'
).fillna(0.)

return [mat], [mat.copy()], mat.copy(), mat.copy()

Expand Down
16 changes: 9 additions & 7 deletions inferelator/crossvalidation_workflow.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
"""
This is a manager which will take an Inferelator workflow and repeatedly run it with different parameters.
This is implemented using deep copies; it is therefore memory-intensive.
This is a manager which will take an Inferelator workflow and repeatedly
run it with different parameters.
This is implemented using deep copies;
it is therefore memory-intensive.
"""

from __future__ import print_function

# I hate py2 now
try:
from builtins import FileExistsError
Expand All @@ -20,8 +20,6 @@
import numpy as np
import pandas as pd


from inferelator.distributed.inferelator_mp import MPControl
from inferelator.utils import Validator as check
from inferelator import utils
from inferelator import workflow
Expand Down Expand Up @@ -102,7 +100,11 @@ def workflow(self):

@workflow.setter
def workflow(self, wkf):
assert check.argument_is_subclass(wkf, workflow.WorkflowBase, allow_none=True)
assert check.argument_is_subclass(
wkf,
workflow.WorkflowBase,
allow_none=True
)
if self._baseline_workflow is not None:
warnings.warn("Replacing stored workflow with a new workflow")
self._baseline_workflow = wkf
Expand Down
Loading
Loading