From 21a604207300920491be565b90e5939b6309c506 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 22 Jun 2020 15:09:45 -0700 Subject: [PATCH] added support for incorrectly ordered GTF entries and test cases for it --- swan_vis/swangraph.py | 11 +++++++++-- swan_vis/utils.py | 12 ++++++++++++ testing/input_files/weird_gtf_entries.gtf | 11 +++++++++++ testing/test_adding_datasets.py | 23 +++++++++++++++++++++++ 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 testing/input_files/weird_gtf_entries.gtf diff --git a/swan_vis/swangraph.py b/swan_vis/swangraph.py index 977c3fc..62118f9 100644 --- a/swan_vis/swangraph.py +++ b/swan_vis/swangraph.py @@ -161,7 +161,7 @@ def add_dataset(self, col, fname, if ftype == 'gtf': self.create_dfs_gtf(fname) elif ftype == 'db': - self.create_dfs_db(fname, annot, whitelist, 'hepg2_1') + self.create_dfs_db(fname, annot, whitelist, dataset_name) # add column to each df to indicate where data came from self.loc_df[col] = True @@ -175,7 +175,7 @@ def add_dataset(self, col, fname, if ftype == 'gtf': temp.create_dfs_gtf(fname) elif ftype == 'db': - temp.create_dfs_db(fname, annot, whitelist, 'hepg2_1') + temp.create_dfs_db(fname, annot, whitelist, dataset_name) self.merge_dfs(temp, col) # remove isms if we have access to that information @@ -541,6 +541,9 @@ def create_dfs_gtf(self, gtf_file): locs[key] = vertex_id vertex_id += 1 + # create inverse loc dict to sort paths by + locs_inv = {v: k for k, v in locs.items()} + # add locs-indexed path to transcripts, and populate edges edges = {} for _,t in transcripts.items(): @@ -580,6 +583,10 @@ def create_dfs_gtf(self, gtf_file): if key not in edges: edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'} + # sort the path based on chromosomal coordinates and strand + # in case there's some weird ordering in the gtf + t['path'] = reorder_locs(t['path'], strand, locs_inv) + # turn transcripts, edges, and locs into dataframes locs = [{'chrom': key[0], 'coord': key[1], diff --git a/swan_vis/utils.py b/swan_vis/utils.py index bc94b78..f5a90af 100644 --- a/swan_vis/utils.py +++ b/swan_vis/utils.py @@ -204,6 +204,18 @@ def find_edge_start_stop(v1, v2, strand): stop = max([v1, v2]) return start, stop +# reorder the locations in a transcript's path based on +# chromosomal coordinate +# TODO +def reorder_locs(path, strand, locs): + coords = [locs[i] for i in path] + path_coords = sorted(zip(path, coords), key=lambda x: x[1]) + path = [i[0] for i in path_coords] + coords = [i[1][1] for i in path_coords] + if strand == '-': + path.reverse() + return path + # get novelty types associated with each transcript def get_transcript_novelties(fields): if fields['transcript_status'] == 'KNOWN': diff --git a/testing/input_files/weird_gtf_entries.gtf b/testing/input_files/weird_gtf_entries.gtf new file mode 100644 index 0000000..e97ddfd --- /dev/null +++ b/testing/input_files/weird_gtf_entries.gtf @@ -0,0 +1,11 @@ +chr6 HAVANA transcript 143815949 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap"; +chr6 HAVANA exon 143815949 143816984 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 1; exon_id "ENSE00001828368.3_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143494812-143495847"; remap_status "full_contig"; +chr6 HAVANA exon 143818526 143818634 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 2; exon_id "ENSE00002227591.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143497389-143497497"; remap_status "full_contig"; +chr6 HAVANA exon 143823069 143823259 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 3; exon_id "ENSE00003473218.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143501932-143502122"; remap_status "full_contig"; +chr6 HAVANA exon 143823492 143823702 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 4; exon_id "ENSE00002258449.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143502355-143502565"; remap_status "full_contig"; +chr6 HAVANA exon 143825050 143825389 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 5; exon_id "ENSE00002248349.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143503913-143504252"; remap_status "full_contig"; +chr6 HAVANA exon 143828374 143828561 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 6; exon_id "ENSE00003708374.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143507237-143507424"; remap_status "full_contig"; +chr6 HAVANA exon 143832548 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 7; exon_id "ENSE00003705756.2_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143511411-143511720"; remap_status "full_contig"; +chr1 HAVANA transcript 326096 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; +chr1 HAVANA exon 326096 326569 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 1; exon_id "ENSE00002058739.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; +chr1 HAVANA exon 327348 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 2; exon_id "ENSE00002064640.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; diff --git a/testing/test_adding_datasets.py b/testing/test_adding_datasets.py index ea6238b..c82ccfc 100644 --- a/testing/test_adding_datasets.py +++ b/testing/test_adding_datasets.py @@ -39,6 +39,29 @@ def test_add_annotation(self): test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1) check_pairs(control, test) + def test_weird_gtf(self): + sg = swan.SwanGraph() + sg.add_dataset('test', 'input_files/weird_gtf_entries.gtf') + print(sg.t_df) + + # check each transcript + tid = 'ENST00000002165.11_3' + path = sg.t_df.loc[tid, 'path'] + print(path) + coords = sg.loc_df.loc[path, 'coord'].tolist() + ctrl_coords = [143832857, 143832548, 143828561, + 143828374, 143825389, 143825050, + 143823702, 143823492, 143823259, + 143823069, 143818634, 143818526, + 143816984, 143815949] + check_pairs(ctrl_coords, coords) + tid = 'ENST00000514436.1' + path = sg.t_df.loc[tid, 'path'] + print(path) + coords = sg.loc_df.loc[path, 'coord'].tolist() + ctrl_coords = [326096, 326569, 327348, 328112] + check_pairs(ctrl_coords, coords) + def check_pairs(control, test): print('control') print(control)