-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #116 from bioinfodlsu/lift-over-updates
Display InterPro information in lift-over results
- Loading branch information
Showing
6 changed files
with
163 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
74 changes: 74 additions & 0 deletions
74
prepare_data/workflow/scripts/iric_description/map-gene-to-interpro.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import pandas as pd | ||
from collections import defaultdict | ||
import csv | ||
import os | ||
import pickle | ||
|
||
|
||
def convert_default_to_vanilla_dict(d):
    """
    Recursively turn a (possibly nested) defaultdict into plain dicts.

    Anything that is not a defaultdict is handed back untouched, so the
    recursion stops at the first non-defaultdict value it meets.

    Adapted from https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o
    """
    if not isinstance(d, defaultdict):
        return d

    plain = {}
    for key, value in d.items():
        plain[key] = convert_default_to_vanilla_dict(value)
    return plain
|
||
|
||
def map_interpro_to_name(interpro_to_name_file, accession_query):
    """
    Look up the name recorded for an InterPro accession in a tab-separated file.

    For every row, the accession is read from the first column and the name
    from the last column; both are stripped of surrounding whitespace before
    use. The file is scanned from the top on every call.

    Parameters:
    - interpro_to_name_file: path to the tab-delimited mapping file
    - accession_query: accession to search for (exact match against the
      stripped first column)

    Returns the name from the first matching row, or None if no row matches.
    """
    # newline='' leaves newline handling to the csv module, as its docs advise
    with open(interpro_to_name_file, newline='') as f:
        for fields in csv.reader(f, delimiter='\t'):
            # csv.reader yields [] for a blank line; indexing it would raise
            if not fields:
                continue

            if fields[0].strip() == accession_query:
                return fields[-1].strip()

    return None
|
||
|
||
def generate_dict(iric_data_file, interpro_to_name_file):
    """
    Build a dictionary mapping each accession to its InterPro annotations.

    The pickled DataFrame in iric_data_file is walked row by row. For rows
    where both the 'KNETMINER_RICE' and 'InterPro:term' cells are truthy,
    every accession in the former is associated with every term in the
    latter.

    Parameters:
    - iric_data_file: path to a pickled pandas DataFrame
    - interpro_to_name_file: path passed to map_interpro_to_name to resolve
      each term to its name

    Returns a plain dict of accession -> set of (term, name) tuples, where
    name is whatever map_interpro_to_name returned for that term.
    """
    DISPLAY_PROGRESS = 1000

    mapping_dict = defaultdict(set)

    # map_interpro_to_name is called with the same file for every term, so
    # remember each term's result instead of repeating the lookup per pair.
    name_by_term = {}

    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # trusted files.
    df = pd.read_pickle(iric_data_file)
    for idx, (_, row) in enumerate(df.iterrows()):
        if row['KNETMINER_RICE'] and row['InterPro:term']:
            for accession in row['KNETMINER_RICE']:
                for term in row['InterPro:term']:
                    if term not in name_by_term:
                        name_by_term[term] = map_interpro_to_name(
                            interpro_to_name_file, term)

                    mapping_dict[accession].add((term, name_by_term[term]))

        if idx % DISPLAY_PROGRESS == 0:
            print("Processed", idx + 1, "entries")

    print("Generated dictionary from IRIC annotation file")

    return convert_default_to_vanilla_dict(mapping_dict)
|
||
|
||
def export_mapping(mapping, output_dir):
    """
    Pickle mapping to <output_dir>/interpro.pickle.

    The output directory (including missing parents) is created when it
    does not exist yet. The path of the written file is printed on success.

    Parameters:
    - mapping: object to serialize
    - output_dir: directory that receives interpro.pickle
    """
    # exist_ok=True removes the race between a separate existence check and
    # the directory creation.
    os.makedirs(output_dir, exist_ok=True)

    output_path = f'{output_dir}/interpro.pickle'
    with open(output_path, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f'Generated {output_path}')
|
||
|
||
if __name__ == '__main__':
    import argparse

    # Positional arguments, listed in the order expected on the command line
    positionals = (
        ('iric_data_file',
         'InterPro annotation file from IRIC'),
        ('interpro_to_name_file',
         'text file mapping InterPro accessions to their respective names'),
        ('output_dir',
         'output directory for the pickled accession-to-InterPro annotation dictionary'),
    )

    parser = argparse.ArgumentParser()
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, help=arg_help)
    args = parser.parse_args()

    # Build the accession-to-InterPro mapping, then write it out as a pickle
    export_mapping(
        generate_dict(args.iric_data_file, args.interpro_to_name_file),
        args.output_dir)
57 changes: 57 additions & 0 deletions
57
prepare_data/workflow/scripts/ogi_mapping/generate-nb-to-iric-dicts.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import csv | ||
import os | ||
import pickle | ||
|
||
|
||
def separate_paralogs(genes):
    """
    Split a comma-separated string of gene IDs into a list of IDs.

    A string without any comma comes back as a one-element list holding
    the string itself.
    """
    # str.split already yields [genes] when there is no comma to split on
    return genes.split(',')
|
||
|
||
def generate_dict(ogi_file, mapping_dict):
    """
    Add MSU-to-IRIC accession pairs from a tab-separated file to mapping_dict.

    The first row is treated as a header and skipped. For each remaining
    row, the MSU accession is read from the second column and the IRIC
    accession from the third. Rows where either value is '.' are ignored,
    as are rows too short to contain both columns (e.g. blank lines).

    Each value may hold several comma-separated IDs; these are paired by
    position and any pair with an empty ID is dropped. If the two lists
    differ in length, the unmatched trailing IDs are ignored.

    Parameters:
    - ogi_file: path to the tab-delimited mapping file
    - mapping_dict: dict updated in place with msu_id -> iric_id entries;
      existing keys are overwritten

    Returns None.
    """
    MSU_ACCESSION = 1
    IRIC_ACCESSION = 2
    MIN_COLUMNS = IRIC_ACCESSION + 1

    # newline='' leaves newline handling to the csv module, as its docs advise
    with open(ogi_file, newline='') as f:
        csv_reader = csv.reader(f, delimiter='\t')

        # Skip header row
        next(csv_reader, None)

        for row in csv_reader:
            # Blank or truncated lines lack the accession columns; indexing
            # them would raise IndexError.
            if len(row) < MIN_COLUMNS:
                continue

            msu = row[MSU_ACCESSION].strip()
            iric = row[IRIC_ACCESSION].strip()
            if msu == '.' or iric == '.':
                continue

            # split(',') gives a one-element list when there is no comma, so
            # single IDs and comma-separated lists share this path.
            for msu_id, iric_id in zip(msu.split(','), iric.split(',')):
                if msu_id and iric_id:
                    mapping_dict[msu_id] = iric_id
|
||
|
||
def export_mapping_dict(mapping_dict, output_dir):
    """
    Pickle mapping_dict to <output_dir>/msu_to_iric.pickle.

    The output directory (including missing parents) is created when it
    does not exist yet.

    Parameters:
    - mapping_dict: object to serialize
    - output_dir: directory that receives msu_to_iric.pickle
    """
    # exist_ok=True removes the race between a separate existence check and
    # the directory creation.
    os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_dir}/msu_to_iric.pickle', 'wb') as f:
        pickle.dump(mapping_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
|
||
|
||
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_dir', help='directory containing the gene ID mapping from RGI')
    parser.add_argument(
        'output_dir', help='output directory for the pickled accession-to-OGI mapping dictionaries')

    args = parser.parse_args()

    # All input files feed one shared dictionary, and a later file overwrites
    # entries for IDs an earlier file already set. os.listdir returns names
    # in arbitrary order, so sort them to make the result reproducible.
    mapping_dict = {}
    for file in sorted(os.listdir(args.input_dir)):
        input_path = f'{args.input_dir}/{file}'
        generate_dict(input_path, mapping_dict)
        print(f'Generated dictionary for {input_path}')

    export_mapping_dict(mapping_dict, args.output_dir)