Skip to content

Commit

Permalink
Merge pull request #116 from bioinfodlsu/lift-over-updates
Browse files Browse the repository at this point in the history
Display InterPro information in lift-over results
  • Loading branch information
pbong authored Sep 10, 2023
2 parents a477d62 + a7d617e commit b3d2a9d
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 24 deletions.
6 changes: 2 additions & 4 deletions callbacks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ class Constants(object):
GENE_DESCRIPTIONS = f'{APP_DATA}/gene_descriptions'
TEXT_MINING = f'{APP_DATA}/text_mining'
QTARO = f'{APP_DATA}/qtaro'
IRIC = f'{APP_DATA}/iric_data'

GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping'
MSU_MAPPING = f'{GENE_ID_MAPPING}/msu_mapping'
OGI_MAPPING = f'{GENE_ID_MAPPING}/ogi_mapping'
NB_MAPPING = f'{GENE_ID_MAPPING}/nb_mapping'
IRIC_MAPPING = f'{GENE_ID_MAPPING}/iric_mapping'

GENOMES_NIPPONBARE = f'{APP_DATA}/genomes/Nipponbare'
ANNOTATIONS_NB = f'{ANNOTATIONS}/Nb'
Expand All @@ -41,12 +43,8 @@ class Constants(object):
GENOME_WIDE_BED = 'query_genomic_intervals'
TFBS_ANNOTATION = f'{TFBS_BEDS}/annotation'

DATA_PREPARATION_SCRIPTS = 'prepare_data/workflow/scripts'
ENRICHMENT_ANALYSIS_SCRIPTS = f'{DATA_PREPARATION_SCRIPTS}/enrichment_analysis'

ENRICHMENT_ANALYSIS = f'{APP_DATA}/enrichment_analysis'
ENRICHMENT_ANALYSIS_MAPPING = 'mapping'
ENRICHMENT_ANALYSIS_MODULES = 'modules'

KEGG_DOSA_GENESET = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-geneset.pickle'
KEGG_DOSA_PATHWAY_NAMES = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-pathway-names.tsv'
Expand Down
41 changes: 22 additions & 19 deletions callbacks/lift_over/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
'IR64': 'indica IR64',
'CMeo': 'japonica CHAO MEO'}

NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot',
NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'InterPro',
'OGI', 'Chromosome', 'Start', 'End', 'Strand', 'QTL Analyses', 'PubMed Article IDs']
OTHER_REF_COLUMNS = ['OGI', 'Name', 'Chromosome', 'Start', 'End', 'Strand']
FRONT_FACING_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'OGI']
Expand Down Expand Up @@ -403,8 +403,11 @@ def get_qtaro_entry(mapping, gene):
return NULL_PLACEHOLDER


def get_qtaro_entries(mapping, genes):
return [get_qtaro_entry(mapping, gene) for gene in genes]
def get_qtaro_entries(genes):
    """Return the QTARO annotation entry for each gene in `genes`.

    The pickled QTARO dictionary is loaded once and shared across all
    lookups, instead of being reloaded per gene.
    """
    with open(Constants.QTARO_DICTIONARY, 'rb') as qtaro_file:
        annotations = pickle.load(qtaro_file)

    entries = []
    for gene in genes:
        entries.append(get_qtaro_entry(annotations, gene))

    return entries


def get_pubmed_entry(gene):
Expand All @@ -417,17 +420,19 @@ def get_pubmed_entry(gene):
except FileNotFoundError:
return NULL_PLACEHOLDER

pubmed_str = ''
for idx, pubmed in enumerate(pubmed_ids):
if idx % 2 == 0:
pubmed_str += f'{pubmed}   '
else:
pubmed_str += f'{pubmed}\n'
return '\n'.join(pubmed_ids)

if pubmed_str[-1] == '\n': # Ends in a newline
return pubmed_str[:-len('\n')]

return pubmed_str[:-len('   ')]
def get_interpro_entry(gene):
    """Return an HTML string listing the InterPro annotations of `gene`.

    The MSU accession `gene` is first translated to its IRIC accession via
    the pickled MSU-to-IRIC mapping; that accession is then looked up in the
    pickled InterPro mapping, whose values are sets of
    (InterPro accession, InterPro name) tuples. Each named entry is rendered
    as a link with `get_interpro_link_single_str`, and the links are joined
    by blank lines. Returns NULL_PLACEHOLDER when the gene has no IRIC
    mapping or no InterPro annotation.
    """
    with open(f'{Constants.IRIC}/interpro.pickle', 'rb') as interpro_f, open(f'{Constants.IRIC_MAPPING}/msu_to_iric.pickle', 'rb') as iric_mapping_f:
        interpro_mapping = pickle.load(interpro_f)
        iric_mapping = pickle.load(iric_mapping_f)

    try:
        # The annotations are stored as a set, whose iteration order varies
        # across interpreter runs (string hash randomization); sort the
        # entries so the displayed links are deterministic.
        entries = sorted(
            entry for entry in interpro_mapping[iric_mapping[gene]] if entry[1])
        return '<br><br>'.join(get_interpro_link_single_str(entry[1], entry[0])
                               for entry in entries)
    except KeyError:
        return NULL_PLACEHOLDER


def get_nb_ortholog(gene, ref):
Expand Down Expand Up @@ -472,13 +477,10 @@ def get_genes_in_Nb(nb_intervals):
ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
for gene in genes_in_interval], ogi_mapping)

# Get QTARO annotations
with open(Constants.QTARO_DICTIONARY, 'rb') as f:
qtaro_dict = pickle.load(f)
qtaro_list = get_qtaro_entries(
qtaro_dict, [gene.id for gene in genes_in_interval])

qtaro_list = get_qtaro_entries([gene.id for gene in genes_in_interval])
pubmed_ids = [get_pubmed_entry(gene.id) for gene in genes_in_interval]
interpro_list = [get_interpro_entry(
gene.id) for gene in genes_in_interval]

# Construct the data frame
df = pd.DataFrame({
Expand All @@ -489,7 +491,8 @@ def get_genes_in_Nb(nb_intervals):
'End': [gene.end for gene in genes_in_interval],
'Strand': [gene.strand for gene in genes_in_interval],
'QTL Analyses': qtaro_list,
'PubMed Article IDs': pubmed_ids
'PubMed Article IDs': pubmed_ids,
'InterPro': interpro_list
})

dfs.append(df)
Expand Down
7 changes: 6 additions & 1 deletion callbacks/links_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
A_HREF = '<a style="white-space:nowrap" target = "_blank" href="'
A_HREF_WITH_WORD_WRAP = '<a target = "_blank" href="'
CLOSE_A_HREF = '">'
LINK_ICON = '&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></a>'
LINK_ICON = '<span style="white-space:nowrap">&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></span></a>'


def get_genes_from_kegg_link(link):
Expand Down Expand Up @@ -56,3 +57,7 @@ def get_rgi_genecard_link(result, id_col):

def get_rgi_orthogroup_link(result, id_col):
return A_HREF + 'https://riceome.hzau.edu.cn/orthogroup/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON


def get_interpro_link_single_str(term, id):
    """Render `term` as an HTML link to the InterPro entry page for the accession `id`."""
    url = 'https://www.ebi.ac.uk/interpro/entry/InterPro/' + id
    return f'{A_HREF_WITH_WORD_WRAP}{url}{CLOSE_A_HREF}{term}{LINK_ICON}'
2 changes: 2 additions & 0 deletions callbacks/text_mining/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te

if num_unique_entries == 1:
stats = f'Found matches across {num_unique_entries} publication'
elif num_unique_entries == MAX_NUM_RESULTS:
stats = f'Found matches across over {num_unique_entries} publications. Consider making your search query more specific'
else:
stats = f'Found matches across {num_unique_entries} publications'

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pandas as pd
from collections import defaultdict
import csv
import os
import pickle


def convert_default_to_vanilla_dict(d):
    """Recursively convert a (possibly nested) defaultdict into a plain dict.

    Non-defaultdict values are returned unchanged.
    Lifted from https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: convert_default_to_vanilla_dict(value) for key, value in d.items()}


# Cache of parsed accession-to-name files, keyed by file path, so each TSV
# is read only once instead of being rescanned for every queried accession
# (the caller invokes this once per (accession, term) pair).
_INTERPRO_NAME_CACHE = {}


def map_interpro_to_name(interpro_to_name_file, accession_query):
    """Return the name of the InterPro entry with accession `accession_query`.

    `interpro_to_name_file` is a TSV whose first column is the InterPro
    accession and whose last column is the entry name. Returns None when the
    accession is not found. The file is parsed once and memoized; the
    previous implementation rescanned the whole file on every call.
    """
    names = _INTERPRO_NAME_CACHE.get(interpro_to_name_file)
    if names is None:
        names = {}
        with open(interpro_to_name_file) as f:
            for line in csv.reader(f, delimiter='\t'):
                if line:  # skip blank lines defensively
                    # setdefault keeps the first occurrence, matching the
                    # original first-match-wins scan order.
                    names.setdefault(line[0].strip(), line[-1].strip())
        _INTERPRO_NAME_CACHE[interpro_to_name_file] = names

    return names.get(accession_query)


def generate_dict(iric_data_file, interpro_to_name_file):
    """Build a mapping from KNETMINER rice accessions to InterPro annotations.

    Reads the pickled IRIC annotation data frame and, for every row that has
    both KNETMINER_RICE accessions and InterPro terms, records each
    (InterPro accession, InterPro name) pair under every KNETMINER accession.
    Progress is printed every 1000 rows. Returns a plain (non-default) dict
    of accession -> set of (accession, name) tuples.
    """
    annotations = defaultdict(set)
    iric_df = pd.read_pickle(iric_data_file)

    DISPLAY_PROGRESS = 1000
    for row_num, (_, row) in enumerate(iric_df.iterrows()):
        knetminer_accessions = row['KNETMINER_RICE']
        interpro_terms = row['InterPro:term']

        if knetminer_accessions and interpro_terms:
            for accession in knetminer_accessions:
                for term in interpro_terms:
                    annotations[accession].add(
                        (term, map_interpro_to_name(interpro_to_name_file, term)))

        if row_num % DISPLAY_PROGRESS == 0:
            print("Processed", row_num + 1, "entries")

    print("Generated dictionary from IRIC annotation file")

    return convert_default_to_vanilla_dict(annotations)


def export_mapping(mapping, output_dir):
    """Pickle `mapping` to `<output_dir>/interpro.pickle`, creating the directory if needed."""
    os.makedirs(output_dir, exist_ok=True)

    output_path = f'{output_dir}/interpro.pickle'
    with open(output_path, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f'Generated {output_dir}/interpro.pickle')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'iric_data_file', help='InterPro annotation file from IRIC')
    parser.add_argument(
        'interpro_to_name_file', help='text file mapping InterPro accessions to their respective names')
    parser.add_argument(
        'output_dir', help='output directory for the pickled accession-to-InterPro annotation dictionary')
    args = parser.parse_args()

    # Build the accession-to-annotation dictionary, then pickle it.
    export_mapping(
        generate_dict(args.iric_data_file, args.interpro_to_name_file),
        args.output_dir)
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import csv
import os
import pickle


def separate_paralogs(genes):
    """Split a comma-separated gene list into its individual gene IDs.

    A string without commas yields a single-element list, so the result is
    always a list of gene IDs.
    """
    return genes.split(',')


def generate_dict(ogi_file, mapping_dict):
    """Populate `mapping_dict` with MSU-accession -> IRIC-accession entries.

    `ogi_file` is a tab-separated RGI gene ID mapping file whose second and
    third columns hold the MSU and IRIC accessions; '.' marks a missing
    value and commas separate paralogs. Entries are added to `mapping_dict`
    in place; nothing is returned.
    """
    # Column indices in the RGI mapping file (hoisted out of the loop —
    # they were previously re-bound on every row).
    MSU_ACCESSION = 1
    IRIC_ACCESSION = 2

    with open(ogi_file) as f:
        csv_reader = csv.reader(f, delimiter='\t')

        # Skip header row
        next(csv_reader, None)

        for row in csv_reader:
            msu = row[MSU_ACCESSION].strip()
            iric = row[IRIC_ACCESSION].strip()

            if msu != '.' and iric != '.':
                # NOTE(review): zip truncates to the shorter paralog list, so
                # unmatched trailing paralogs are silently dropped — confirm
                # the two columns always list paralogs in parallel.
                for msu_id, iric_id in zip(separate_paralogs(msu), separate_paralogs(iric)):
                    if msu_id != '' and iric_id != '':
                        mapping_dict[msu_id] = iric_id


def export_mapping_dict(mapping_dict, output_dir):
    """Pickle `mapping_dict` to `<output_dir>/msu_to_iric.pickle`, creating the directory if needed."""
    os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_dir}/msu_to_iric.pickle', 'wb') as handle:
        pickle.dump(mapping_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_dir', help='directory containing the gene ID mapping from RGI')
    parser.add_argument(
        'output_dir', help='output directory for the pickled accession-to-OGI mapping dictionaries')
    args = parser.parse_args()

    # Merge the mappings from every file in the input directory into a
    # single dictionary, then pickle it.
    msu_to_iric = {}
    for mapping_file in os.listdir(args.input_dir):
        generate_dict(f'{args.input_dir}/{mapping_file}', msu_to_iric)
        print(f'Generated dictionary for {args.input_dir}/{mapping_file}')

    export_mapping_dict(msu_to_iric, args.output_dir)

0 comments on commit b3d2a9d

Please sign in to comment.