added support for incorrectly ordered GTF entries and test cases for it

mortazavilab · Jun 22, 2020 · 21a6042 · 21a6042
1 parent 68b199d
commit 21a6042
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 2 deletions.
diff --git a/swan_vis/swangraph.py b/swan_vis/swangraph.py
@@ -161,7 +161,7 @@ def add_dataset(self, col, fname,
 			if ftype == 'gtf':
 				self.create_dfs_gtf(fname)
 			elif ftype == 'db':
-				self.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
+				self.create_dfs_db(fname, annot, whitelist, dataset_name)
 
 			# add column to each df to indicate where data came from
 			self.loc_df[col] = True
@@ -175,7 +175,7 @@ def add_dataset(self, col, fname,
 			if ftype == 'gtf':
 				temp.create_dfs_gtf(fname)
 			elif ftype == 'db':
-				temp.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
+				temp.create_dfs_db(fname, annot, whitelist, dataset_name)
 			self.merge_dfs(temp, col)
 
 		# remove isms if we have access to that information
@@ -541,6 +541,9 @@ def create_dfs_gtf(self, gtf_file):
 				locs[key] = vertex_id
 				vertex_id += 1
 
+		# create inverse loc dict to sort paths by
+		locs_inv = {v: k for k, v in locs.items()}
+
 		# add locs-indexed path to transcripts, and populate edges
 		edges = {}
 		for _,t in transcripts.items():
@@ -580,6 +583,10 @@ def create_dfs_gtf(self, gtf_file):
 					if key not in edges:
 						edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}
 
+			# sort the path based on chromosomal coordinates and strand
+			# in case there's some weird ordering in the gtf 
+			t['path'] = reorder_locs(t['path'], strand, locs_inv)
+
 		# turn transcripts, edges, and locs into dataframes
 		locs = [{'chrom': key[0],
 				 'coord': key[1],

diff --git a/swan_vis/utils.py b/swan_vis/utils.py
@@ -204,6 +204,18 @@ def find_edge_start_stop(v1, v2, strand):
 		stop = max([v1, v2])
 	return start, stop
 
+# reorder the locations in a transcript's path based on
+# chromosomal coordinate
+# TODO
+def reorder_locs(path, strand, locs):
+	coords = [locs[i] for i in path]
+	path_coords = sorted(zip(path, coords), key=lambda x: x[1])
+	path = [i[0] for i in path_coords]
+	coords = [i[1][1] for i in path_coords]
+	if strand == '-':
+		path.reverse()
+	return path 
+
 # get novelty types associated with each transcript
 def get_transcript_novelties(fields):
 	if fields['transcript_status'] == 'KNOWN':

diff --git a/testing/input_files/weird_gtf_entries.gtf b/testing/input_files/weird_gtf_entries.gtf
@@ -0,0 +1,11 @@
+chr6	HAVANA	transcript	143815949	143832857	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";
+chr6	HAVANA	exon	143815949	143816984	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 1; exon_id "ENSE00001828368.3_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143494812-143495847"; remap_status "full_contig";
+chr6	HAVANA	exon	143818526	143818634	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 2; exon_id "ENSE00002227591.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143497389-143497497"; remap_status "full_contig";
+chr6	HAVANA	exon	143823069	143823259	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 3; exon_id "ENSE00003473218.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143501932-143502122"; remap_status "full_contig";
+chr6	HAVANA	exon	143823492	143823702	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 4; exon_id "ENSE00002258449.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143502355-143502565"; remap_status "full_contig";
+chr6	HAVANA	exon	143825050	143825389	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 5; exon_id "ENSE00002248349.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143503913-143504252"; remap_status "full_contig";
+chr6	HAVANA	exon	143828374	143828561	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 6; exon_id "ENSE00003708374.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143507237-143507424"; remap_status "full_contig";
+chr6	HAVANA	exon	143832548	143832857	.	-	.	gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 7; exon_id "ENSE00003705756.2_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143511411-143511720"; remap_status "full_contig";
+chr1	HAVANA	transcript	326096	328112	.	+	.	gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
+chr1	HAVANA	exon	326096	326569	.	+	.	gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 1; exon_id "ENSE00002058739.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
+chr1	HAVANA	exon	327348	328112	.	+	.	gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 2; exon_id "ENSE00002064640.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
diff --git a/testing/test_adding_datasets.py b/testing/test_adding_datasets.py
@@ -39,6 +39,29 @@ def test_add_annotation(self):
 		test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
 		check_pairs(control, test)
 
+	def test_weird_gtf(self):
+		sg = swan.SwanGraph()
+		sg.add_dataset('test', 'input_files/weird_gtf_entries.gtf')
+		print(sg.t_df)
+
+		# check each transcript 
+		tid = 'ENST00000002165.11_3'
+		path = sg.t_df.loc[tid, 'path']
+		print(path)
+		coords = sg.loc_df.loc[path, 'coord'].tolist()
+		ctrl_coords = [143832857, 143832548, 143828561,
+					   143828374, 143825389, 143825050,
+					   143823702, 143823492, 143823259,
+					   143823069, 143818634, 143818526, 
+					   143816984, 143815949]
+		check_pairs(ctrl_coords, coords)
+		tid = 'ENST00000514436.1'
+		path = sg.t_df.loc[tid, 'path']
+		print(path)
+		coords = sg.loc_df.loc[path, 'coord'].tolist()
+		ctrl_coords = [326096, 326569, 327348, 328112]
+		check_pairs(ctrl_coords, coords)
+
 def check_pairs(control, test):
 	print('control')
 	print(control)