Skip to content

Commit

Permalink
added support for incorrectly ordered GTF entries and test cases for it
Browse files Browse the repository at this point in the history
  • Loading branch information
fairliereese committed Jun 22, 2020
1 parent 68b199d commit 21a6042
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 2 deletions.
11 changes: 9 additions & 2 deletions swan_vis/swangraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def add_dataset(self, col, fname,
if ftype == 'gtf':
self.create_dfs_gtf(fname)
elif ftype == 'db':
self.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
self.create_dfs_db(fname, annot, whitelist, dataset_name)

# add column to each df to indicate where data came from
self.loc_df[col] = True
Expand All @@ -175,7 +175,7 @@ def add_dataset(self, col, fname,
if ftype == 'gtf':
temp.create_dfs_gtf(fname)
elif ftype == 'db':
temp.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
temp.create_dfs_db(fname, annot, whitelist, dataset_name)
self.merge_dfs(temp, col)

# remove isms if we have access to that information
Expand Down Expand Up @@ -541,6 +541,9 @@ def create_dfs_gtf(self, gtf_file):
locs[key] = vertex_id
vertex_id += 1

# create inverse loc dict to sort paths by
locs_inv = {v: k for k, v in locs.items()}

# add locs-indexed path to transcripts, and populate edges
edges = {}
for _,t in transcripts.items():
Expand Down Expand Up @@ -580,6 +583,10 @@ def create_dfs_gtf(self, gtf_file):
if key not in edges:
edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}

# sort the path based on chromosomal coordinates and strand
# in case there's some weird ordering in the gtf
t['path'] = reorder_locs(t['path'], strand, locs_inv)

# turn transcripts, edges, and locs into dataframes
locs = [{'chrom': key[0],
'coord': key[1],
Expand Down
12 changes: 12 additions & 0 deletions swan_vis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,18 @@ def find_edge_start_stop(v1, v2, strand):
stop = max([v1, v2])
return start, stop

# reorder the locations in a transcript's path based on
# chromosomal coordinate
# TODO
def reorder_locs(path, strand, locs):
coords = [locs[i] for i in path]
path_coords = sorted(zip(path, coords), key=lambda x: x[1])
path = [i[0] for i in path_coords]
coords = [i[1][1] for i in path_coords]
if strand == '-':
path.reverse()
return path

# get novelty types associated with each transcript
def get_transcript_novelties(fields):
if fields['transcript_status'] == 'KNOWN':
Expand Down
11 changes: 11 additions & 0 deletions testing/input_files/weird_gtf_entries.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chr6 HAVANA transcript 143815949 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";
chr6 HAVANA exon 143815949 143816984 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 1; exon_id "ENSE00001828368.3_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143494812-143495847"; remap_status "full_contig";
chr6 HAVANA exon 143818526 143818634 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 2; exon_id "ENSE00002227591.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143497389-143497497"; remap_status "full_contig";
chr6 HAVANA exon 143823069 143823259 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 3; exon_id "ENSE00003473218.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143501932-143502122"; remap_status "full_contig";
chr6 HAVANA exon 143823492 143823702 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 4; exon_id "ENSE00002258449.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143502355-143502565"; remap_status "full_contig";
chr6 HAVANA exon 143825050 143825389 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 5; exon_id "ENSE00002248349.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143503913-143504252"; remap_status "full_contig";
chr6 HAVANA exon 143828374 143828561 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 6; exon_id "ENSE00003708374.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143507237-143507424"; remap_status "full_contig";
chr6 HAVANA exon 143832548 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 7; exon_id "ENSE00003705756.2_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143511411-143511720"; remap_status "full_contig";
chr1 HAVANA transcript 326096 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
chr1 HAVANA exon 326096 326569 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 1; exon_id "ENSE00002058739.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
chr1 HAVANA exon 327348 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 2; exon_id "ENSE00002064640.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
23 changes: 23 additions & 0 deletions testing/test_adding_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,29 @@ def test_add_annotation(self):
test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
check_pairs(control, test)

def test_weird_gtf(self):
sg = swan.SwanGraph()
sg.add_dataset('test', 'input_files/weird_gtf_entries.gtf')
print(sg.t_df)

# check each transcript
tid = 'ENST00000002165.11_3'
path = sg.t_df.loc[tid, 'path']
print(path)
coords = sg.loc_df.loc[path, 'coord'].tolist()
ctrl_coords = [143832857, 143832548, 143828561,
143828374, 143825389, 143825050,
143823702, 143823492, 143823259,
143823069, 143818634, 143818526,
143816984, 143815949]
check_pairs(ctrl_coords, coords)
tid = 'ENST00000514436.1'
path = sg.t_df.loc[tid, 'path']
print(path)
coords = sg.loc_df.loc[path, 'coord'].tolist()
ctrl_coords = [326096, 326569, 327348, 328112]
check_pairs(ctrl_coords, coords)

def check_pairs(control, test):
print('control')
print(control)
Expand Down

0 comments on commit 21a6042

Please sign in to comment.