From 99c20ba0fcb6fc4707c404975a62ea3741f12b93 Mon Sep 17 00:00:00 2001
From: Natalie Catlett
Date: Mon, 3 Mar 2014 16:01:41 -0500
Subject: [PATCH] added script to write gene-orthology .bel file

---
Review notes: the line structure of the patch was restored and the added
lines were revised (string identity test in get_orthologs; stale o_label
and missing exits in orthology.py). The blob ids after '..' on the index
lines are those of the original commit.

 datasets.py  |  21 ++++++++--
 orthology.py | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+), 5 deletions(-)
 create mode 100755 orthology.py

diff --git a/datasets.py b/datasets.py
index d3942b2..f4752c8 100644
--- a/datasets.py
+++ b/datasets.py
@@ -32,14 +32,21 @@ class OrthologyData(DataSet):
     def __init__(self, dictionary={}, prefix='use-index-term-prefix'):
         super().__init__(dictionary, prefix)

+    def get_values(self):
+        ''' Yield each term id in the data set. '''
+        for term_id in self._dict:
+            yield term_id
+
     def get_orthologs(self, term_id):
         orthologs = set()
         mapping = self._dict.get(term_id)
         mouse_orthologs = mapping.get('mouse_ortholog_id').split('|')
         orthologs.update(mouse_orthologs)
-        human_orthologs = mapping.get('human_ortholog_id').split('|')
-        human_orthologs = {'HGNC:' + ortho for ortho in human_orthologs}
-        orthologs.update(human_orthologs)
+        # skip a missing or empty value so that no bare 'HGNC:' id is added
+        if mapping.get('human_ortholog_id'):
+            human_orthologs = mapping.get('human_ortholog_id').split('|')
+            human_orthologs = {'HGNC:' + ortho for ortho in human_orthologs}
+            orthologs.update(human_orthologs)
         return orthologs

     def __str__(self):
@@ -293,8 +300,12 @@ def get_values(self):

     def get_label(self, term_id):
         ''' Return preferred label associated with term id. '''
-        label = self._dict.get(term_id).get('Symbol')
-        return label
+        mapping = self._dict.get(term_id)
+        if mapping is None:
+            return None
+        else:
+            label = mapping.get('Symbol')
+            return label

     def get_encoding(self, term_id):
         mapping = self._dict.get(term_id)
diff --git a/orthology.py b/orthology.py
new file mode 100755
index 0000000..3277646
--- /dev/null
+++ b/orthology.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# coding: utf-8
+''' Write gene-orthology.bel from the pickled HGNC, RGD, RGD orthology and
+MGI data objects found in the directory given with -n. '''
+
+import argparse
+import os
+import pickle
+import time
+import datasets
+import bel_functions
+from collections import defaultdict
+
+# str() values of the data objects this script depends on
+REQUIRED_DATA = ['rgd', 'hgnc', 'rgd_ortho', 'mgi']
+
+
+def load_data(names):
+    ''' Return a dict of the pickled data objects in the current directory
+    whose str() value is in names, keyed by that value. '''
+    data_dict = {}
+    for file_name in os.listdir("."):
+        if file_name.endswith("parsed_data.pickle"):
+            # NOTE: unpickling can run arbitrary code - only load pickle
+            # files that come from a trusted source.
+            with open(file_name, 'rb') as f:
+                d = pickle.load(f)
+            if str(d) in names:
+                data_dict[str(d)] = d
+    return data_dict
+
+
+def orthology_statements(ortho_data, label_data, prefix, targets, warning):
+    ''' Return the set of BEL orthology statements for one data set.
+
+    ortho_data - object providing get_values() and get_orthologs(term_id)
+    label_data - object providing get_label(term_id) for the source terms
+    prefix - namespace prefix used for the source terms
+    targets - dict mapping an ortholog id prefix to the data object that
+        provides its labels; ids with any other prefix are skipped
+    warning - format string printed when an ortholog has no label; it is
+        formatted with (ortholog prefix, ortholog value, source term) '''
+    statements = set()
+    for term_id in ortho_data.get_values():
+        term_label = label_data.get_label(term_id)
+        if term_label is None:
+            print('WARNING - missing label for {0}, {1}'.format(prefix, term_id))
+            continue
+        term = bel_functions.bel_term(term_label, prefix, 'g')
+        orthos = ortho_data.get_orthologs(term_id)
+        if orthos is None:
+            continue
+        for o in orthos:
+            # ortholog ids are expected in the form 'PREFIX:value'
+            if o.count(':') != 1:
+                continue
+            o_prefix, value = o.split(':')
+            target = targets.get(o_prefix)
+            if target is None:
+                continue
+            # looked up for every ortholog, so a label left over from a
+            # previous iteration can never be reused
+            o_label = target.get_label(value)
+            if o_label is None:
+                print(warning.format(o_prefix, value, term))
+                continue
+            ortho_term = bel_functions.bel_term(o_label, o_prefix, 'g')
+            statements.add('{0} orthologous {1}'.format(term, ortho_term))
+    return statements
+
+
+if __name__=='__main__':
+    # command line arguments - directory for pickled data objects
+    parser = argparse.ArgumentParser(description="""Generate BEL orthology file
+    from pickled data objects.""")
+
+    parser.add_argument("-n", required=True, metavar="DIRECTORY",
+                    help="directory to store the new namespace equivalence data")
+    args = parser.parse_args()
+    if not os.path.isdir(args.n):
+        # stop here - otherwise the current directory would be processed
+        raise SystemExit('data directory {0} not found!'.format(args.n))
+    os.chdir(args.n)
+
+    data_dict = load_data(REQUIRED_DATA)
+    missing = [data for data in REQUIRED_DATA if data not in data_dict]
+    for data in missing:
+        print('missing required dependency {0}!'.format(data))
+    if missing:
+        # all four data sets are used below
+        raise SystemExit(1)
+
+    hgnc_ortho_statements = orthology_statements(
+        data_dict['hgnc'], data_dict['hgnc'], 'HGNC',
+        {'RGD': data_dict['rgd'], 'MGI': data_dict['mgi']},
+        'WARNING - missing label for {0}, {1}')
+
+    rgd_ortho_statements = orthology_statements(
+        data_dict['rgd_ortho'], data_dict['rgd'], 'RGD',
+        {'HGNC': data_dict['hgnc'], 'MGI': data_dict['mgi']},
+        'WARNING - {2} missing label for {0}, {1}')
+
+    with open('gene-orthology.bel', 'w') as ortho:
+        ortho.write('SET Citation = {"Online Resource", "HUGO Gene Nomenclature Committee data download", "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz"}\n')
+        for s in sorted(hgnc_ortho_statements):
+            ortho.write(s + '\n')
+        ortho.write('\n')
+        ortho.write('SET Citation = {"Online Resource","RGD Orthology FTP file", "ftp://rgd.mcw.edu/pub/data_release/RGD_ORTHOLOGS.txt"}\n')
+        for s in sorted(rgd_ortho_statements):
+            ortho.write(s + '\n')