Skip to content

Commit

Permalink
Merge pull request #116 from bioinfodlsu/lift-over-updates
Browse files Browse the repository at this point in the history
Display InterPro information in lift-over results
  • Loading branch information
pbong authored Sep 10, 2023
2 parents a477d62 + a7d617e commit b3d2a9d
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 24 deletions.
6 changes: 2 additions & 4 deletions callbacks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ class Constants(object):
GENE_DESCRIPTIONS = f'{APP_DATA}/gene_descriptions'
TEXT_MINING = f'{APP_DATA}/text_mining'
QTARO = f'{APP_DATA}/qtaro'
IRIC = f'{APP_DATA}/iric_data'

GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping'
MSU_MAPPING = f'{GENE_ID_MAPPING}/msu_mapping'
OGI_MAPPING = f'{GENE_ID_MAPPING}/ogi_mapping'
NB_MAPPING = f'{GENE_ID_MAPPING}/nb_mapping'
IRIC_MAPPING = f'{GENE_ID_MAPPING}/iric_mapping'

GENOMES_NIPPONBARE = f'{APP_DATA}/genomes/Nipponbare'
ANNOTATIONS_NB = f'{ANNOTATIONS}/Nb'
Expand All @@ -41,12 +43,8 @@ class Constants(object):
GENOME_WIDE_BED = 'query_genomic_intervals'
TFBS_ANNOTATION = f'{TFBS_BEDS}/annotation'

DATA_PREPARATION_SCRIPTS = 'prepare_data/workflow/scripts'
ENRICHMENT_ANALYSIS_SCRIPTS = f'{DATA_PREPARATION_SCRIPTS}/enrichment_analysis'

ENRICHMENT_ANALYSIS = f'{APP_DATA}/enrichment_analysis'
ENRICHMENT_ANALYSIS_MAPPING = 'mapping'
ENRICHMENT_ANALYSIS_MODULES = 'modules'

KEGG_DOSA_GENESET = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-geneset.pickle'
KEGG_DOSA_PATHWAY_NAMES = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-pathway-names.tsv'
Expand Down
41 changes: 22 additions & 19 deletions callbacks/lift_over/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
'IR64': 'indica IR64',
'CMeo': 'japonica CHAO MEO'}

NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot',
NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'InterPro',
'OGI', 'Chromosome', 'Start', 'End', 'Strand', 'QTL Analyses', 'PubMed Article IDs']
OTHER_REF_COLUMNS = ['OGI', 'Name', 'Chromosome', 'Start', 'End', 'Strand']
FRONT_FACING_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'OGI']
Expand Down Expand Up @@ -403,8 +403,11 @@ def get_qtaro_entry(mapping, gene):
return NULL_PLACEHOLDER


def get_qtaro_entries(mapping, genes):
return [get_qtaro_entry(mapping, gene) for gene in genes]
def get_qtaro_entries(genes):
    """Return the QTARO annotation entry for each gene in `genes`.

    The pickled QTARO dictionary is loaded once and shared across all
    lookups, instead of being reloaded per gene.
    """
    with open(Constants.QTARO_DICTIONARY, 'rb') as qtaro_file:
        annotations = pickle.load(qtaro_file)

    entries = []
    for gene in genes:
        entries.append(get_qtaro_entry(annotations, gene))

    return entries


def get_pubmed_entry(gene):
Expand All @@ -417,17 +420,19 @@ def get_pubmed_entry(gene):
except FileNotFoundError:
return NULL_PLACEHOLDER

pubmed_str = ''
for idx, pubmed in enumerate(pubmed_ids):
if idx % 2 == 0:
pubmed_str += f'{pubmed}   '
else:
pubmed_str += f'{pubmed}\n'
return '\n'.join(pubmed_ids)

if pubmed_str[-1] == '\n': # Ends in a newline
return pubmed_str[:-len('\n')]

return pubmed_str[:-len('   ')]
def get_interpro_entry(gene):
    """Return an HTML string listing the InterPro annotations of `gene`.

    The MSU accession `gene` is first translated to its IRIC accession via
    the pickled MSU-to-IRIC mapping; that accession is then looked up in the
    pickled InterPro mapping, whose values are sets of
    (InterPro accession, InterPro name) tuples. Each named entry is rendered
    as a link with `get_interpro_link_single_str`, and the links are joined
    by blank lines. Returns NULL_PLACEHOLDER when the gene has no IRIC
    mapping or no InterPro annotation.
    """
    with open(f'{Constants.IRIC}/interpro.pickle', 'rb') as interpro_f, open(f'{Constants.IRIC_MAPPING}/msu_to_iric.pickle', 'rb') as iric_mapping_f:
        interpro_mapping = pickle.load(interpro_f)
        iric_mapping = pickle.load(iric_mapping_f)

    try:
        # The annotations are stored as a set, whose iteration order varies
        # across interpreter runs (string hash randomization); sort the
        # entries so the displayed links are deterministic.
        entries = sorted(
            entry for entry in interpro_mapping[iric_mapping[gene]] if entry[1])
        return '<br><br>'.join(get_interpro_link_single_str(entry[1], entry[0])
                               for entry in entries)
    except KeyError:
        return NULL_PLACEHOLDER


def get_nb_ortholog(gene, ref):
Expand Down Expand Up @@ -472,13 +477,10 @@ def get_genes_in_Nb(nb_intervals):
ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
for gene in genes_in_interval], ogi_mapping)

# Get QTARO annotations
with open(Constants.QTARO_DICTIONARY, 'rb') as f:
qtaro_dict = pickle.load(f)
qtaro_list = get_qtaro_entries(
qtaro_dict, [gene.id for gene in genes_in_interval])

qtaro_list = get_qtaro_entries([gene.id for gene in genes_in_interval])
pubmed_ids = [get_pubmed_entry(gene.id) for gene in genes_in_interval]
interpro_list = [get_interpro_entry(
gene.id) for gene in genes_in_interval]

# Construct the data frame
df = pd.DataFrame({
Expand All @@ -489,7 +491,8 @@ def get_genes_in_Nb(nb_intervals):
'End': [gene.end for gene in genes_in_interval],
'Strand': [gene.strand for gene in genes_in_interval],
'QTL Analyses': qtaro_list,
'PubMed Article IDs': pubmed_ids
'PubMed Article IDs': pubmed_ids,
'InterPro': interpro_list
})

dfs.append(df)
Expand Down
7 changes: 6 additions & 1 deletion callbacks/links_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
A_HREF = '<a style="white-space:nowrap" target = "_blank" href="'
A_HREF_WITH_WORD_WRAP = '<a target = "_blank" href="'
CLOSE_A_HREF = '">'
LINK_ICON = '&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></a>'
LINK_ICON = '<span style="white-space:nowrap">&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></span></a>'


def get_genes_from_kegg_link(link):
Expand Down Expand Up @@ -56,3 +57,7 @@ def get_rgi_genecard_link(result, id_col):

def get_rgi_orthogroup_link(result, id_col):
return A_HREF + 'https://riceome.hzau.edu.cn/orthogroup/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON


def get_interpro_link_single_str(term, id):
    """Render `term` as an HTML link to the InterPro entry page for the accession `id`."""
    url = 'https://www.ebi.ac.uk/interpro/entry/InterPro/' + id
    return f'{A_HREF_WITH_WORD_WRAP}{url}{CLOSE_A_HREF}{term}{LINK_ICON}'
2 changes: 2 additions & 0 deletions callbacks/text_mining/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te

if num_unique_entries == 1:
stats = f'Found matches across {num_unique_entries} publication'
elif num_unique_entries == MAX_NUM_RESULTS:
stats = f'Found matches across over {num_unique_entries} publications. Consider making your search query more specific'
else:
stats = f'Found matches across {num_unique_entries} publications'

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pandas as pd
from collections import defaultdict
import csv
import os
import pickle


def convert_default_to_vanilla_dict(d):
    """Recursively convert a (possibly nested) defaultdict into a plain dict.

    Non-defaultdict values are returned unchanged.
    Lifted from https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: convert_default_to_vanilla_dict(value) for key, value in d.items()}


# Cache of parsed accession-to-name files, keyed by file path, so each TSV
# is read only once instead of being rescanned for every queried accession
# (the caller invokes this once per (accession, term) pair).
_INTERPRO_NAME_CACHE = {}


def map_interpro_to_name(interpro_to_name_file, accession_query):
    """Return the name of the InterPro entry with accession `accession_query`.

    `interpro_to_name_file` is a TSV whose first column is the InterPro
    accession and whose last column is the entry name. Returns None when the
    accession is not found. The file is parsed once and memoized; the
    previous implementation rescanned the whole file on every call.
    """
    names = _INTERPRO_NAME_CACHE.get(interpro_to_name_file)
    if names is None:
        names = {}
        with open(interpro_to_name_file) as f:
            for line in csv.reader(f, delimiter='\t'):
                if line:  # skip blank lines defensively
                    # setdefault keeps the first occurrence, matching the
                    # original first-match-wins scan order.
                    names.setdefault(line[0].strip(), line[-1].strip())
        _INTERPRO_NAME_CACHE[interpro_to_name_file] = names

    return names.get(accession_query)


def generate_dict(iric_data_file, interpro_to_name_file):
    """Build a mapping from KNETMINER rice accessions to InterPro annotations.

    Reads the pickled IRIC annotation data frame and, for every row that has
    both KNETMINER_RICE accessions and InterPro terms, records each
    (InterPro accession, InterPro name) pair under every KNETMINER accession.
    Progress is printed every 1000 rows. Returns a plain (non-default) dict
    of accession -> set of (accession, name) tuples.
    """
    annotations = defaultdict(set)
    iric_df = pd.read_pickle(iric_data_file)

    DISPLAY_PROGRESS = 1000
    for row_num, (_, row) in enumerate(iric_df.iterrows()):
        knetminer_accessions = row['KNETMINER_RICE']
        interpro_terms = row['InterPro:term']

        if knetminer_accessions and interpro_terms:
            for accession in knetminer_accessions:
                for term in interpro_terms:
                    annotations[accession].add(
                        (term, map_interpro_to_name(interpro_to_name_file, term)))

        if row_num % DISPLAY_PROGRESS == 0:
            print("Processed", row_num + 1, "entries")

    print("Generated dictionary from IRIC annotation file")

    return convert_default_to_vanilla_dict(annotations)


def export_mapping(mapping, output_dir):
    """Pickle `mapping` to `<output_dir>/interpro.pickle`, creating the directory if needed."""
    os.makedirs(output_dir, exist_ok=True)

    output_path = f'{output_dir}/interpro.pickle'
    with open(output_path, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f'Generated {output_dir}/interpro.pickle')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'iric_data_file', help='InterPro annotation file from IRIC')
    parser.add_argument(
        'interpro_to_name_file', help='text file mapping InterPro accessions to their respective names')
    parser.add_argument(
        'output_dir', help='output directory for the pickled accession-to-InterPro annotation dictionary')
    args = parser.parse_args()

    # Build the accession-to-annotation dictionary, then pickle it.
    export_mapping(
        generate_dict(args.iric_data_file, args.interpro_to_name_file),
        args.output_dir)
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import csv
import os
import pickle


def separate_paralogs(genes):
    """Split a comma-separated gene list into its individual gene IDs.

    A string without commas yields a single-element list, so the result is
    always a list of gene IDs.
    """
    return genes.split(',')


def generate_dict(ogi_file, mapping_dict):
    """Populate `mapping_dict` with MSU-accession -> IRIC-accession entries.

    `ogi_file` is a tab-separated RGI gene ID mapping file whose second and
    third columns hold the MSU and IRIC accessions; '.' marks a missing
    value and commas separate paralogs. Entries are added to `mapping_dict`
    in place; nothing is returned.
    """
    # Column indices in the RGI mapping file (hoisted out of the loop —
    # they were previously re-bound on every row).
    MSU_ACCESSION = 1
    IRIC_ACCESSION = 2

    with open(ogi_file) as f:
        csv_reader = csv.reader(f, delimiter='\t')

        # Skip header row
        next(csv_reader, None)

        for row in csv_reader:
            msu = row[MSU_ACCESSION].strip()
            iric = row[IRIC_ACCESSION].strip()

            if msu != '.' and iric != '.':
                # NOTE(review): zip truncates to the shorter paralog list, so
                # unmatched trailing paralogs are silently dropped — confirm
                # the two columns always list paralogs in parallel.
                for msu_id, iric_id in zip(separate_paralogs(msu), separate_paralogs(iric)):
                    if msu_id != '' and iric_id != '':
                        mapping_dict[msu_id] = iric_id


def export_mapping_dict(mapping_dict, output_dir):
    """Pickle `mapping_dict` to `<output_dir>/msu_to_iric.pickle`, creating the directory if needed."""
    os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_dir}/msu_to_iric.pickle', 'wb') as handle:
        pickle.dump(mapping_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_dir', help='directory containing the gene ID mapping from RGI')
    parser.add_argument(
        'output_dir', help='output directory for the pickled accession-to-OGI mapping dictionaries')
    args = parser.parse_args()

    # Merge the mappings from every file in the input directory into a
    # single dictionary, then pickle it.
    msu_to_iric = {}
    for mapping_file in os.listdir(args.input_dir):
        generate_dict(f'{args.input_dir}/{mapping_file}', msu_to_iric)
        print(f'Generated dictionary for {args.input_dir}/{mapping_file}')

    export_mapping_dict(msu_to_iric, args.output_dir)

0 comments on commit b3d2a9d

Please sign in to comment.