Merge branch 'main' of https://github.com/bioinfodlsu/rice-pilaf into input
bioinfodlsu · Sep 11, 2023 · 2efdf27 · 2efdf27
2 parents 45c6b93 + b3d2a9d
commit 2efdf27
Show file tree

Hide file tree

Showing 13 changed files with 226 additions and 75 deletions.
diff --git a/callbacks/coexpression/util.py b/callbacks/coexpression/util.py
@@ -264,7 +264,7 @@ def convert_transcript_to_msu_id(transcript_ids_str, network):
     Returns:
     - Equivalent MSU accessions of the KEGG transcript IDs
     """
-    with open(f'{Constants.GENE_ID_MAPPING}/{network}/transcript-to-msu-id.pickle', 'rb') as f:
+    with open(f'{Constants.MSU_MAPPING}/{network}/transcript-to-msu-id.pickle', 'rb') as f:
         mapping_dict = pickle.load(f)
 
     output_str = ''

diff --git a/callbacks/constants.py b/callbacks/constants.py
@@ -1,22 +1,26 @@
 class Constants(object):
-    LIFT_OVER = 'lift-over'
-    COEXPRESSION = 'co-expression'
-    TFBS = 'tf-enrichment'
-    IGV = 'browse-loci'
-    TEXT_MINING = 'text-mining'
+    LABEL_LIFT_OVER = 'lift-over'
+    LABEL_COEXPRESSION = 'co-expression'
+    LABEL_TFBS = 'tf-enrichment'
+    LABEL_IGV = 'browse-loci'
+    LABEL_TEXT_MINING = 'text-mining'
 
     DATA = 'static'
     APP_DATA = f'{DATA}/app_data'
     RAW_DATA = f'{DATA}/raw_data'
 
     ANNOTATIONS = f'{APP_DATA}/annotations'
     ALIGNMENTS = f'{APP_DATA}/alignments'
-    OGI_MAPPING = f'{APP_DATA}/ogi_mapping'
-    NB_MAPPING = f'{APP_DATA}/nb_mapping'
     GENE_DESCRIPTIONS = f'{APP_DATA}/gene_descriptions'
-    GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping'
     TEXT_MINING = f'{APP_DATA}/text_mining'
     QTARO = f'{APP_DATA}/qtaro'
+    IRIC = f'{APP_DATA}/iric_data'
+
+    GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping'
+    MSU_MAPPING = f'{GENE_ID_MAPPING}/msu_mapping'
+    OGI_MAPPING = f'{GENE_ID_MAPPING}/ogi_mapping'
+    NB_MAPPING = f'{GENE_ID_MAPPING}/nb_mapping'
+    IRIC_MAPPING = f'{GENE_ID_MAPPING}/iric_mapping'
 
     GENOMES_NIPPONBARE = f'{APP_DATA}/genomes/Nipponbare'
     ANNOTATIONS_NB = f'{ANNOTATIONS}/Nb'
@@ -39,12 +43,8 @@ class Constants(object):
     GENOME_WIDE_BED = 'query_genomic_intervals'
     TFBS_ANNOTATION = f'{TFBS_BEDS}/annotation'
 
-    DATA_PREPARATION_SCRIPTS = 'prepare_data/workflow/scripts'
-    ENRICHMENT_ANALYSIS_SCRIPTS = f'{DATA_PREPARATION_SCRIPTS}/enrichment_analysis'
-
     ENRICHMENT_ANALYSIS = f'{APP_DATA}/enrichment_analysis'
     ENRICHMENT_ANALYSIS_MAPPING = 'mapping'
-    ENRICHMENT_ANALYSIS_MODULES = 'modules'
 
     KEGG_DOSA_GENESET = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-geneset.pickle'
     KEGG_DOSA_PATHWAY_NAMES = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-pathway-names.tsv'

diff --git a/callbacks/lift_over/util.py b/callbacks/lift_over/util.py
@@ -29,7 +29,7 @@
                      'IR64': 'indica IR64',
                      'CMeo': 'japonica CHAO MEO'}
 
-NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot',
+NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'InterPro',
               'OGI', 'Chromosome', 'Start', 'End', 'Strand', 'QTL Analyses', 'PubMed Article IDs']
 OTHER_REF_COLUMNS = ['OGI', 'Name', 'Chromosome', 'Start', 'End', 'Strand']
 FRONT_FACING_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'OGI']
@@ -354,6 +354,11 @@ def get_ogi_other_ref(ref, nb_intervals):
         for intersection in gff_intersections:
             ref_interval = to_genomic_interval(
                 intersection.attributes['Name'][0])
+
+            # Skip if assembler does not know what to do with contig
+            if is_error(ref_interval):
+                continue
+
             genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop),
                                                           completely_within=False, featuretype='gene'))
 
@@ -398,8 +403,11 @@ def get_qtaro_entry(mapping, gene):
         return NULL_PLACEHOLDER
 
 
-def get_qtaro_entries(mapping, genes):
-    return [get_qtaro_entry(mapping, gene) for gene in genes]
+def get_qtaro_entries(genes):
+    with open(Constants.QTARO_DICTIONARY, 'rb') as f:
+        qtaro_dict = pickle.load(f)
+
+    return [get_qtaro_entry(qtaro_dict, gene) for gene in genes]
 
 
 def get_pubmed_entry(gene):
@@ -412,17 +420,19 @@ def get_pubmed_entry(gene):
     except FileNotFoundError:
         return NULL_PLACEHOLDER
 
-    pubmed_str = ''
-    for idx, pubmed in enumerate(pubmed_ids):
-        if idx % 2 == 0:
-            pubmed_str += f'{pubmed}&nbsp;&nbsp;&nbsp;'
-        else:
-            pubmed_str += f'{pubmed}\n'
+    return '\n'.join(pubmed_ids)
 
-    if pubmed_str[-1] == '\n':        # Ends in a newline
-        return pubmed_str[:-len('\n')]
 
-    return pubmed_str[:-len('&nbsp;&nbsp;&nbsp;')]
+def get_interpro_entry(gene):
+    with open(f'{Constants.IRIC}/interpro.pickle', 'rb') as interpro_f,  open(f'{Constants.IRIC_MAPPING}/msu_to_iric.pickle', 'rb') as iric_mapping_f:
+        interpro_mapping = pickle.load(interpro_f)
+        iric_mapping = pickle.load(iric_mapping_f)
+
+        try:
+            return '<br><br>'.join([get_interpro_link_single_str(entry[1], entry[0])
+                                    for entry in interpro_mapping[iric_mapping[gene]] if entry[1]])
+        except KeyError:
+            return NULL_PLACEHOLDER
 
 
 def get_nb_ortholog(gene, ref):
@@ -467,13 +477,10 @@ def get_genes_in_Nb(nb_intervals):
             ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
                                      for gene in genes_in_interval], ogi_mapping)
 
-        # Get QTARO annotations
-        with open(Constants.QTARO_DICTIONARY, 'rb') as f:
-            qtaro_dict = pickle.load(f)
-            qtaro_list = get_qtaro_entries(
-                qtaro_dict, [gene.id for gene in genes_in_interval])
-
+        qtaro_list = get_qtaro_entries([gene.id for gene in genes_in_interval])
         pubmed_ids = [get_pubmed_entry(gene.id) for gene in genes_in_interval]
+        interpro_list = [get_interpro_entry(
+            gene.id) for gene in genes_in_interval]
 
         # Construct the data frame
         df = pd.DataFrame({
@@ -484,7 +491,8 @@ def get_genes_in_Nb(nb_intervals):
             'End': [gene.end for gene in genes_in_interval],
             'Strand': [gene.strand for gene in genes_in_interval],
             'QTL Analyses': qtaro_list,
-            'PubMed Article IDs': pubmed_ids
+            'PubMed Article IDs': pubmed_ids,
+            'InterPro': interpro_list
         })
 
         dfs.append(df)
@@ -544,6 +552,11 @@ def get_genes_in_other_ref(ref, nb_intervals):
         for intersection in gff_intersections:
             ref_interval = to_genomic_interval(
                 intersection.attributes['Name'][0])
+
+            # Skip if assembler does not know what to do with contig
+            if is_error(ref_interval):
+                continue
+
             genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop),
                                                           completely_within=False, featuretype='gene'))
 

diff --git a/callbacks/links_util.py b/callbacks/links_util.py
@@ -1,6 +1,7 @@
 A_HREF = '<a style="white-space:nowrap" target = "_blank" href="'
+A_HREF_WITH_WORD_WRAP = '<a target = "_blank" href="'
 CLOSE_A_HREF = '">'
-LINK_ICON = '&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></a>'
+LINK_ICON = '<span style="white-space:nowrap">&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></span></a>'
 
 
 def get_genes_from_kegg_link(link):
@@ -56,3 +57,7 @@ def get_rgi_genecard_link(result, id_col):
 
 def get_rgi_orthogroup_link(result, id_col):
     return A_HREF + 'https://riceome.hzau.edu.cn/orthogroup/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON
+
+
+def get_interpro_link_single_str(term, id):
+    return A_HREF_WITH_WORD_WRAP + 'https://www.ebi.ac.uk/interpro/entry/InterPro/' + id + CLOSE_A_HREF + term + LINK_ICON
diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py
@@ -125,6 +125,8 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te
 
                 if num_unique_entries == 1:
                     stats = f'Found matches across {num_unique_entries} publication'
+                elif num_unique_entries == MAX_NUM_RESULTS:
+                    stats = f'Found matches across over {num_unique_entries} publications. Consider making your search query more specific'
                 else:
                     stats = f'Found matches across {num_unique_entries} publications'
 

diff --git a/pages/analysis/browse_loci.py b/pages/analysis/browse_loci.py
@@ -6,7 +6,7 @@
 layout = html.Div(
     id={
         'type': 'analysis-layout',
-        'label': Constants.IGV
+        'label': Constants.LABEL_IGV
     },
     hidden=True,
     children=[

diff --git a/pages/analysis/co_expr.py b/pages/analysis/co_expr.py
@@ -221,7 +221,7 @@
 layout = html.Div(
     id={
         'type': 'analysis-layout',
-        'label': Constants.COEXPRESSION
+        'label': Constants.LABEL_COEXPRESSION
     },
     hidden=True,
 

diff --git a/pages/analysis/lift_over.py b/pages/analysis/lift_over.py
@@ -7,25 +7,25 @@
 layout = html.Div(
     id={
         'type': 'analysis-layout',
-        'label': Constants.LIFT_OVER
+        'label': Constants.LABEL_LIFT_OVER
     },
     hidden=True,
     children=[
         html.Div([
             html.P(
-               ['In this page, you can obtain the list of genes overlapping your input intervals. '
-                'Optionally, you can choose genomes to lift-over your Nipponbare coordinates to. Click ',
+                ['In this page, you can obtain the list of genes overlapping your input intervals. '
+                 'Optionally, you can choose genomes to lift-over your Nipponbare coordinates to. Click ',
                  dcc.Link(
-                    ['here ', html.I(
-                        id='demo-link',
-                        className='fa-solid fa-up-right-from-square fa-2xs'
-                    )],
-                    href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.1-Gene-List-and-Lift%E2%80%90over',
-                    target='_blank',
-                    className='top-navbar-item'
-                ),
-                ' for user guide.'
-               ]
+                     ['here ', html.I(
+                         id='demo-link',
+                         className='fa-solid fa-up-right-from-square fa-2xs'
+                     )],
+                     href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.1-Gene-List-and-Lift%E2%80%90over',
+                     target='_blank',
+                     className='top-navbar-item'
+                 ),
+                 ' for user guide.'
+                 ]
             )
         ], className='analysis-intro p-3'),
 

diff --git a/pages/analysis/text_mining.py b/pages/analysis/text_mining.py
@@ -6,24 +6,24 @@
 layout = html.Div(
     id={
         'type': 'analysis-layout',
-        'label': Constants.TEXT_MINING
+        'label': Constants.LABEL_TEXT_MINING
     },
     hidden=True,
     children=[
 
         html.Div([
             html.P(
                 [
-                   'In this page, you can retrieve gene names associated with traits, diseases, chemicals, etc. '
-                   'from a database constructed from text-mined PubMed abstracts. Click ',
+                    'In this page, you can retrieve gene names associated with traits, diseases, chemicals, etc. '
+                    'from a database constructed from text-mined PubMed abstracts. Click ',
                     dcc.Link(
-                    ['here ', html.I(
-                        id='demo-link',
-                        className='fa-solid fa-up-right-from-square fa-2xs'
-                    )],
-                    href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.2-Gene-retrieval-by-text-mining',
-                    target='_blank',
-                    className='top-navbar-item'
+                        ['here ', html.I(
+                            id='demo-link',
+                            className='fa-solid fa-up-right-from-square fa-2xs'
+                        )],
+                        href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.2-Gene-retrieval-by-text-mining',
+                        target='_blank',
+                        className='top-navbar-item'
                     ),
                     ' for user guide.'
                 ]

diff --git a/pages/analysis/tf_enrich.py b/pages/analysis/tf_enrich.py
@@ -6,24 +6,24 @@
 layout = html.Div(
     id={
         'type': 'analysis-layout',
-        'label': Constants.TFBS
+        'label': Constants.LABEL_TFBS
     },
     hidden=True,
     children=[
         html.Div([
             html.P(
                 ['In this page, you can search for transcription factors whose binding sites overlap significantly with your intervals,'
-                'the idea being that your intervals might contain variants that affect the binding affinity of transcription factors. Click ',
-                dcc.Link(
-                    ['here ', html.I(
-                        id='demo-link',
-                        className='fa-solid fa-up-right-from-square fa-2xs'
-                    )],
-                    href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.4-Regulatory-Feature-Enrichment',
-                    target='_blank',
-                    className='top-navbar-item'
-                ),
-                ' for user guide.']
+                 'the idea being that your intervals might contain variants that affect the binding affinity of transcription factors. Click ',
+                 dcc.Link(
+                     ['here ', html.I(
+                         id='demo-link',
+                         className='fa-solid fa-up-right-from-square fa-2xs'
+                     )],
+                     href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.4-Regulatory-Feature-Enrichment',
+                     target='_blank',
+                     className='top-navbar-item'
+                 ),
+                    ' for user guide.']
             )
         ], className='analysis-intro p-3'),
 

diff --git a/pages/analysis_layout.py b/pages/analysis_layout.py
@@ -13,11 +13,11 @@
 
 def get_analaysis_layout_dictionary():
     return OrderedDict({
-        Constants.LIFT_OVER: 'Gene List and Lift-Over',
-        Constants.TEXT_MINING: 'Gene Retrieval by Text Mining',
-        Constants.COEXPRESSION: 'Co-Expression Network Analysis',
-        Constants.TFBS: 'Regulatory Feature Enrichment',
-        Constants.IGV: 'Browse Loci'
+        Constants.LABEL_LIFT_OVER: 'Gene List and Lift-Over',
+        Constants.LABEL_TEXT_MINING: 'Gene Retrieval by Text Mining',
+        Constants.LABEL_COEXPRESSION: 'Co-Expression Network Analysis',
+        Constants.LABEL_TFBS: 'Regulatory Feature Enrichment',
+        Constants.LABEL_IGV: 'Browse Loci'
     })