From 06c6e03b7d4c16fd583390ffce1e0db87f63463d Mon Sep 17 00:00:00 2001 From: memgonzales Date: Tue, 29 Aug 2023 22:12:58 +0800 Subject: [PATCH] Refactor universal enrichment logic following clusterProfiler's enricher --- callbacks/coexpression/util.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/callbacks/coexpression/util.py b/callbacks/coexpression/util.py index 70273063..d8f189b6 100644 --- a/callbacks/coexpression/util.py +++ b/callbacks/coexpression/util.py @@ -190,27 +190,33 @@ def do_module_enrichment_analysis(implicated_gene_ids, genomic_intervals, addl_g # provided by clusterProfiler # ==================================================================================== with open(IMPLICATED_GENES_PATH) as implicated_genes_file, open(MODULES_PATH) as modules_file, open(f'{INPUT_GENES_DIR}/enriched_modules.tsv', 'w') as enriched_modules_file: - background_genes = set() - for line in modules_file: - background_genes = background_genes.union( - set(line.strip().split('\t'))) - + # There is only a single line, which lists all the implicated genes for line in implicated_genes_file: line = line.strip().split('\t') implicated_genes = set(line) + modules = [] + background_genes = set() + for idx, line in enumerate(modules_file): + module_genes = set(line.strip().split('\t')) + background_genes = background_genes.union(module_genes) + if implicated_genes.intersection(module_genes): + modules.append(idx) + p_values_indices = [] p_values = [] modules_file.seek(0) for idx, line in enumerate(modules_file): - module = line.strip().split('\t') - module_genes = set(module) - table = construct_contigency_table( - background_genes, implicated_genes, module_genes) - - p_value = fisher_exact(table, alternative='greater').pvalue - if not (0.999999999 < p_value and p_value < 1.000000001): - p_values.append(p_value) + if idx in modules: + module = line.strip().split('\t') + module_genes = set(module) + table = construct_contigency_table( + background_genes, implicated_genes, module_genes) + + p_values.append(fisher_exact( + table, alternative='greater').pvalue) + + # Add 1 since user-facing module number is one-based p_values_indices.append(idx + 1) _, adj_p_values, _, _ = sm.multipletests(