From 529450f40e2739779201619dfe5a196377395cc8 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Tue, 9 Jun 2020 12:37:52 +0200 Subject: [PATCH 1/6] Add grn multiprocessing script to package CLI --- setup.py | 4 +++- .../pyscenic/cli}/arboreto_with_multiprocessing.py | 0 2 files changed, 3 insertions(+), 1 deletion(-) rename {scripts => src/pyscenic/cli}/arboreto_with_multiprocessing.py (100%) diff --git a/setup.py b/setup.py index d483792..e6a4ad4 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ def read_requirements(fname): py_modules=[os.path.splitext(os.path.basename(path))[0] for path in glob.glob('src/*.py')], include_package_data=True, install_requires=read_requirements('requirements.txt'), + scripts=['src/pyscenic/cli/arboreto_with_multiprocessing.py'], entry_points = { 'console_scripts': ['pyscenic = pyscenic.cli.pyscenic:main', 'db2feather = pyscenic.cli.db2feather:main', @@ -85,4 +86,5 @@ def read_requirements(fname): 'invertdb = pyscenic.cli.invertdb:main', 'gmt2regions = pyscenic.cli.gmt2regions:main'], } -) \ No newline at end of file +) + diff --git a/scripts/arboreto_with_multiprocessing.py b/src/pyscenic/cli/arboreto_with_multiprocessing.py similarity index 100% rename from scripts/arboreto_with_multiprocessing.py rename to src/pyscenic/cli/arboreto_with_multiprocessing.py From 90f9bc68510f6aa521ac9ee0c09095d63c3faa57 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Tue, 9 Jun 2020 12:53:12 +0200 Subject: [PATCH 2/6] Updated docs for mp script --- README.rst | 2 +- docs/faq.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 7f8bf44..23342ce 100644 --- a/README.rst +++ b/README.rst @@ -30,7 +30,7 @@ News **0.10.0 release** -* Added a helper script `arboreto_with_multiprocessing.py `_ that runs the Arboreto GRN algorithms (GRNBoost2, GENIE3) without Dask for compatibility. 
+* Added a helper script `arboreto_with_multiprocessing.py `_ that runs the Arboreto GRN algorithms (GRNBoost2, GENIE3) without Dask for compatibility. * Ability to set a fixed seed in both the AUCell step and in the calculation of regulon thresholds (CLI parameter :code:`--seed`; aucell function parameter :code:`seed`). diff --git a/docs/faq.rst b/docs/faq.rst index d65a5b5..62a903c 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -14,7 +14,7 @@ It is recommended to use the older version of Dask and Distributed for stability But in many cases this still results in issues with the GRN step. -An alternative is to use the multiprocessing implementation of Arboreto recently included in pySCENIC (`arboreto_with_multiprocessing.py `_). +An alternative is to use the multiprocessing implementation of Arboreto recently included in pySCENIC (`arboreto_with_multiprocessing.py `_). This script uses the Arboreto and pySCENIC codebase to run GRNBoost2 (or GENIE3) without Dask. The eliminates the possibility of running the GRN step across multiple nodes, but brings provides additional stability. The run time is generally equivalent to the Dask implementation using the same number of workers. From 08e2bbc1f7eff3081640e73b2555991c4735c788 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Mon, 18 May 2020 21:02:25 +0200 Subject: [PATCH 3/6] Add error message when regulons file is empty - Resolves #133 --- src/pyscenic/transform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pyscenic/transform.py b/src/pyscenic/transform.py index 5ea86dd..36b2f23 100644 --- a/src/pyscenic/transform.py +++ b/src/pyscenic/transform.py @@ -300,6 +300,7 @@ def df2regulons(df, save_columns=[]) -> Sequence[Regulon]: :return: A sequence of regulons. """ + assert not df.empty, 'Signatures dataframe is empty!' 
print("Create regulons from a dataframe of enriched features.") print("Additional columns saved: {}".format(save_columns)) From 3205b9a9f0037d98a2ec69a2259e075870c736fe Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Wed, 3 Jun 2020 10:23:17 +0200 Subject: [PATCH 4/6] Bugfix in TF-gene correlation calculation: - If there are genes present in the network adjacencies that are missing from the expression matrix, the modules_from_adjacencies function will fail with a pandas KeyError. - Fixes #103 and #149 --- src/pyscenic/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pyscenic/utils.py b/src/pyscenic/utils.py index 79d0d1a..ebbde01 100644 --- a/src/pyscenic/utils.py +++ b/src/pyscenic/utils.py @@ -263,6 +263,9 @@ def iter_modules(adjc, context): # Add correlation column and create two disjoint set of adjacencies. LOGGER.info("Calculating Pearson correlations.") + # test for genes present in the adjacencies but not present in the expression matrix: + unique_adj_genes = set(adjacencies[COLUMN_NAME_TF]).union(set(adjacencies[COLUMN_NAME_TARGET])) - set(ex_mtx.columns) + assert len(unique_adj_genes)==0, f"Found {len(unique_adj_genes)} genes present in the network (adjacencies) output, but missing from the expression matrix. Is this a different gene expression matrix?" LOGGER.warn(f"Note on correlation calculation: the default behaviour for calculating the correlations has changed after pySCENIC verion 0.9.16. Previously, the default was to calculate the correlation between a TF and target gene using only cells with non-zero expression values (mask_dropouts=True). The current default is now to use all cells to match the behavior of the R verision of SCENIC. 
The original settings can be retained by setting 'rho_mask_dropouts=True' in the modules_from_adjacencies function, or '--mask_dropouts' from the CLI.\n\tDropout masking is currently set to [{rho_mask_dropouts}].") adjacencies = add_correlation(adjacencies, ex_mtx, rho_threshold=rho_threshold, mask_dropouts=rho_mask_dropouts) From 18f5f4e302b11e1a26aaf9d0c7332372ebbf1f6d Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Wed, 3 Jun 2020 21:59:56 +0200 Subject: [PATCH 5/6] cisTarget step: Check for modules with zero db overlap. - Previously such modules would cause an error, now these modules are skipped. - Related to #158, #177, #132, #85 --- src/pyscenic/transform.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pyscenic/transform.py b/src/pyscenic/transform.py index 36b2f23..acdf964 100644 --- a/src/pyscenic/transform.py +++ b/src/pyscenic/transform.py @@ -124,6 +124,11 @@ def module2features_auc1st_impl(db: Type[RankingDatabase], module: Regulon, moti features, genes, rankings = df.index.values, df.columns.values, df.values weights = np.asarray([module[gene] for gene in genes]) if weighted_recovery else np.ones(len(genes)) + # include check for modules with no genes that could be mapped to the db. This can happen when including non protein-coding genes in the expression matrix. + if(df.empty): + LOGGER.warning("No genes in module {} could be mapped to {}. Skipping this module.".format(module.name, db.name)) + return pd.DataFrame(), None, None, genes, None + # Calculate recovery curves, AUC and NES values. # For fast unweighted implementation so weights to None. 
aucs = calc_aucs(df, db.total_genes, weights, auc_threshold) From 107fa427764daf73239e183215d5e422c8c8ab65 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Tue, 16 Jun 2020 14:59:49 +0200 Subject: [PATCH 6/6] Fix bug in motif url construction - Fixes #158 --- src/pyscenic/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyscenic/utils.py b/src/pyscenic/utils.py index ebbde01..ce0effe 100644 --- a/src/pyscenic/utils.py +++ b/src/pyscenic/utils.py @@ -319,7 +319,7 @@ def add_motif_url(df: pd.DataFrame, base_url: str): :param base_url: :return: """ - df[("Enrichment", COLUMN_NAME_MOTIF_URL)] = list(map(partial(urljoin, base=base_url), df.index.get_level_values(COLUMN_NAME_MOTIF_ID))) + df[("Enrichment", COLUMN_NAME_MOTIF_URL)] = list(map(partial(urljoin, base_url), df.index.get_level_values(COLUMN_NAME_MOTIF_ID))) return df