From 214aac05a89e5ed11a49d9f475dd0db9390bcbde Mon Sep 17 00:00:00 2001 From: jhawkey Date: Thu, 13 Aug 2015 13:39:40 +1000 Subject: [PATCH 01/19] fixed error for when no hits around found but bed files non-empty --- scripts/create_typing_out.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index 0c542c9..ecde99d 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -316,7 +316,6 @@ def main(): else: removed_results['region_' + str(lines)] = line.strip() + '\tintersect.bed\n' lines += 1 - # Get size of IS query is_length = insertion_length(args.seq) # Look inside the closest file (known or imprecise hits) @@ -485,6 +484,9 @@ def main(): else: removed_results['region_' + str(region)] = line.strip() + '\tright_unpaired.bed\n' region += 1 + # Open the output file and write in the header + output = open(args.output + '_table.txt', 'w') + output.write('\t'.join(header) + '\n') # Sort regions into the correct order table_keys = [] for key in results: @@ -493,32 +495,30 @@ def main(): for region in table_keys: region_indexes.append(region.split('region_')[1]) arr = np.vstack((table_keys, region_indexes)).transpose() - if arr != 0: - sorted_keys = arr[arr[:,1].astype('int').argsort()] - # Write out the found hits to file - output = open(args.output + '_table.txt', 'w') - output.write('\t'.join(header) + '\n') if arr != 0: + sorted_keys = arr[arr[:,1].astype('int').argsort()] for key in sorted_keys[:,0]: output.write(key + '\t' + '\t'.join(str(i) for i in results[key]) + '\n') - if arr == 0: + # If all the hits failed, write out that no hits were found + elif arr == 0: output.write('No hits found.') output.close() # Write out the found hits to bed file for viewing in IGV with open(args.output + '_hits.bed', 'w') as outfile: - if args.chr_name == 'not_specified': - args.chr_name = SeqIO.read(args.ref, 'genbank').id - if args.igv == 1: - outfile.write('#gffTags \n') - for key in sorted_keys[:,0]: - r = results[key] - outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + '\t' + 'Name=' + key + ';orientation=' + r[0] + ';' + ';'.join([header[i+1] + '=' + str(r[i]) for i in range(3, len(r))])+ '\n') - else: - for key in sorted_keys[:,0]: - r = results[key] - outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + key + '\n') + if arr != 0: + if args.chr_name == 'not_specified': + args.chr_name = SeqIO.read(args.ref, 'genbank').id + if args.igv == 1: + outfile.write('#gffTags \n') + for key in sorted_keys[:,0]: + r = results[key] + outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + '\t' + 'Name=' + key + ';orientation=' + r[0] + ';' + ';'.join([header[i+1] + '=' + str(r[i]) for i in range(3, len(r))])+ '\n') + else: + for key in sorted_keys[:,0]: + r = results[key] + outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + key + '\n') # Write out hits that were removed for whatever reason to file if len(removed_results) != 0: From a4f99497408f01f5ad6c9a55bca74b23cc15f873 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 14 Aug 2015 10:16:29 +1000 Subject: [PATCH 02/19] ordered feature_list to make binary search function work properly --- scripts/compiled_table.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py index 6f63f3c..0433489 100644 --- a/scripts/compiled_table.py +++ b/scripts/compiled_table.py @@ -317,6 +317,13 @@ def get_flanking_genes(features, feature_list, left, right, cds_features, trna_f left_feature_index = binary_search(feature_list, left, 'L') right_feature_index = binary_search(feature_list, right, 'R') # Extract the SeqFeature object that corresponds to that index + if type(left_feature_index) != int or type(right_feature_index) != int: + print 'left index' + print left_feature_index + print 'right index' + print right_feature_index + print 'left position: ' + str(left) + print 'right position: ' + str(right) left_feature = features[left_feature_index] right_feature = features[right_feature_index] @@ -528,7 +535,8 @@ def main(): feature_count += 1 else: feature_count += 1 - + # Sort the list just in case it's out of order (has caused issues in the past!!) + feature_list = sorted(feature_list, key=itemgetter(0)) # Get flanking genes for pos in list_of_positions: genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna) From 330109107ac8132372620c76317e3fc6167b52fc Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 10:17:44 +1000 Subject: [PATCH 03/19] binary search catches when index error occurs for findAfterPosition --- scripts/compiled_table.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py index 0433489..75007d7 100644 --- a/scripts/compiled_table.py +++ b/scripts/compiled_table.py @@ -294,7 +294,19 @@ def findFeatureBeforePosition(features, isPosition, m): def findFeatureAfterPosition(features, isPosition, m): # If we are looking for the feature to the right of the # IS position, then either m or m+1 is our answer - + print m + print len(features) + print features[m] + print isPosition + # an index error will occur if m is the final feature, so just check that the first part is true + # and return m + try: + features[m+1] + except IndexError: + if features[m][0] > isPosition: + return features[m][2] + else: + return "4 - THIS SHOULDN'T HAPPEN!" # If the end of the m feature is before the IS position, # then m is before the IS and m+1 is the correct feature if features[m][1] < isPosition: @@ -307,7 +319,6 @@ def findFeatureAfterPosition(features, isPosition, m): # then m will be closer to the IS and is the correct feature elif features[m][0] > isPosition and features[m+1][0] > isPosition: return features[m][2] - else: return "3 - THIS SHOULDN'T HAPPEN!" From 732b8fd74786d46e72d1c656cd84226b7855cdae Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 11:02:52 +1000 Subject: [PATCH 04/19] updated distances when flanking genes are first and last genes in ref --- scripts/compiled_table.py | 39 ++++++++++++++++++++++++++++++++---- scripts/create_typing_out.py | 6 +++--- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py index 75007d7..c108b97 100644 --- a/scripts/compiled_table.py +++ b/scripts/compiled_table.py @@ -305,8 +305,10 @@ def findFeatureAfterPosition(features, isPosition, m): except IndexError: if features[m][0] > isPosition: return features[m][2] + # otherwise we must be after the final position, so need to + # return the start position of the very first feature else: - return "4 - THIS SHOULDN'T HAPPEN!" + return features[0][2] # If the end of the m feature is before the IS position, # then m is before the IS and m+1 is the correct feature if features[m][1] < isPosition: @@ -322,12 +324,12 @@ def findFeatureAfterPosition(features, isPosition, m): else: return "3 - THIS SHOULDN'T HAPPEN!" -def get_flanking_genes(features, feature_list, left, right, cds_features, trna_features, rrna_features): +def get_flanking_genes(features, feature_list, left, right, cds_features, trna_features, rrna_features, genome_size): # Find the correct indexes left_feature_index = binary_search(feature_list, left, 'L') right_feature_index = binary_search(feature_list, right, 'R') - # Extract the SeqFeature object that corresponds to that index + # Print out information if returning one of the error codes if type(left_feature_index) != int or type(right_feature_index) != int: print 'left index' print left_feature_index @@ -335,6 +337,9 @@ def get_flanking_genes(features, feature_list, left, right, cds_features, trna_f print right_feature_index print 'left position: ' + str(left) print 'right position: ' + str(right) + print left_feature_index + print right_feature_index + # Extract the SeqFeature object that corresponds to that index left_feature = features[left_feature_index] right_feature = features[right_feature_index] @@ -349,6 +354,32 @@ def get_flanking_genes(features, feature_list, left, right, cds_features, trna_f left_dist = abs(max(left_feature.location.start, left_feature.location.end) - left) # The distance to the right gene is the startmost position of the feature - the right IS coord right_dist = abs(min(right_feature.location.start, right_feature.location.end) - right) + + # Here is probably a good place to check if we've got a position that wraps around from start + # to end of the reference + # If we've got a distance that is close to the size of the reference, then we know we need to + # alter it + if left_dist in range(int(round(genome_size * 0.9)), int(round(genome_size * 1.1))): + # The the left hand feature is at the end of the genome + # Distance from IS position to start of the genome is the + # position itself + dist_to_start = left + # Distance from the end of the final gene to the end of the + # genome + dist_to_end = abs(left_feature.location.end - genome_size) + # So the total distnace is those two added together + left_dist = dist_to_start + dist_to_end + + elif right_dist in range(int(round(genome_size * 0.9)), int(round(genome_size * 1.1))): + # Then the right hand feature is at the start of the genome + # Distance from the IS position to the end of the genome + dist_to_end = abs(genome_size - right) + # Distance from the start of the genome to the start of the first feature + # is the start position of the first feature + dist_to_feature = right_feature.location.start + # So the total distance is those two added together + right_dist = dist_to_end + dist_to_feature + # The first string in this values list is the main gene id (eg locus_tag) left_gene = [left_values[0], str(left_dist), left_values[1:]] right_gene = [right_values[0], str(right_dist), right_values[1:]] @@ -550,7 +581,7 @@ def main(): feature_list = sorted(feature_list, key=itemgetter(0)) # Get flanking genes for pos in list_of_positions: - genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna) + genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna, len(gb.seq)) pos.left_feature = genes_before pos.right_feature = genes_after diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index ecde99d..7e48a30 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -139,7 +139,7 @@ def novel_hit(x_L, y_L, x_R, y_R, x, y, genbank, ref, cds, trna, rrna, gap, orie genbank.features.append(right_feature) # Get the genes flanking the left and right ends - gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna) + gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna, len(genbank.seq)) #print gene_left #print gene_right # If the genes are the same, then hit is inside the gene @@ -187,7 +187,7 @@ def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna, # Taking all four coordinates and finding min and max to avoid coordinates # that overlap the actual IS (don't want to return those in gene calls) # Mark as a known call to improve accuracy of gene calling - gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna) + gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna, len(genbank.seq)) #gene_left = get_other_gene(ref, min(y_L, y_R, x_R, x_L), "left", cds, trna, rrna, known=True) #gene_right = get_other_gene(ref, max(y_L, y_R, x_R, x_L), "right", cds, trna, rrna, known=True) @@ -210,7 +210,7 @@ def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna, else: # Then I'm not sure what this is # Get flanking genes anyway - gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna) + gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna, len(genbank.seq)) if 'unpaired' in file_loc: call = 'Possible related IS?' else: From c7abc1ad4ad1ff8741d646ec6da5feb0b79709fe Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 11:04:37 +1000 Subject: [PATCH 05/19] removed excess print statements --- scripts/compiled_table.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py index c108b97..b342ccd 100644 --- a/scripts/compiled_table.py +++ b/scripts/compiled_table.py @@ -294,10 +294,7 @@ def findFeatureBeforePosition(features, isPosition, m): def findFeatureAfterPosition(features, isPosition, m): # If we are looking for the feature to the right of the # IS position, then either m or m+1 is our answer - print m - print len(features) - print features[m] - print isPosition + # an index error will occur if m is the final feature, so just check that the first part is true # and return m try: From b2a0062a81974432b46fa9cb8314365399aced82 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 13:37:34 +1000 Subject: [PATCH 06/19] fixed when unpaired hit pairs with itself --- scripts/create_typing_out.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index 7e48a30..c2a5bc2 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -402,7 +402,10 @@ def main(): L_range = range(min(x_L, y_L), max(x_L, y_L)) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): - removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n' + removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, left_unpaired.bed\n' + # or if a hit is paired with itself, which means it doesn't have a pair + elif x_L == x_R and y_L == y_R: + removed_results['region_' + str(region)] = line.strip() + '\tleft_unpaired.bed, paired with self\n' else: # Get orientation if x_L < x_R and y_L < y_R: @@ -443,6 +446,7 @@ def main(): with open(args.right_unpaired) as right_unpaired: for line in right_unpaired: info = line.strip().split('\t') + print info #this is an unpaired hit if line.strip().split('\t')[3:6] in line_check: #get coordinate info @@ -454,7 +458,10 @@ def main(): L_range = range(min(x_L, y_L), max(x_L, y_L)) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): - removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n' + removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, right_unpaired.bed\n' + # or if a hit is paired with itself, which means it doesn't have a pair + elif x_L == x_R and y_L == y_R: + removed_results['region_' + str(region)] = line.strip() + '\tright_unpaired.bed, paired with self\n' else: #get orientation if x_L < x_R and y_L < y_R: From f14df560da7fb9a08fa1acf18a01e7815201d576 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 13:38:16 +1000 Subject: [PATCH 07/19] removed excess print statement --- scripts/create_typing_out.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index c2a5bc2..4d07436 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -446,7 +446,6 @@ def main(): with open(args.right_unpaired) as right_unpaired: for line in right_unpaired: info = line.strip().split('\t') - print info #this is an unpaired hit if line.strip().split('\t')[3:6] in line_check: #get coordinate info From d254275d8d706333b9aa2164b9973482ffd2202b Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 28 Aug 2015 13:39:48 +1000 Subject: [PATCH 08/19] removed more excess print statements --- scripts/compiled_table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py index b342ccd..b5defa1 100644 --- a/scripts/compiled_table.py +++ b/scripts/compiled_table.py @@ -334,8 +334,6 @@ def get_flanking_genes(features, feature_list, left, right, cds_features, trna_f print right_feature_index print 'left position: ' + str(left) print 'right position: ' + str(right) - print left_feature_index - print right_feature_index # Extract the SeqFeature object that corresponds to that index left_feature = features[left_feature_index] right_feature = features[right_feature_index] From 475bb845f741c0f49e21b98641f1728fd5ec3117 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Thu, 3 Sep 2015 12:57:13 +1000 Subject: [PATCH 09/19] sorting list of features --- scripts/create_typing_out.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index 4d07436..f87df5c 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -264,6 +264,7 @@ def main(): else: feature_count_list += 1 + feature_list = sorted(feature_list, key=itemgetter(0)) # Initialise feature count feature_count = 0 From 511cd94f9bfe2557cc4dc17e92e0be939fdfc2e9 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 18 Sep 2015 14:33:23 +1000 Subject: [PATCH 10/19] updated readme section for slurm script to be more clear --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aeaeadc..492343f 100755 --- a/README.md +++ b/README.md @@ -211,7 +211,9 @@ eg: ## Running multiple jobs on a SLURM queing system There is no need to set the --output flag in other_args for this script, it sets it itself using the name of the readset. -Also, make sure the query sequence is already index with bwa (using `bwa mem query.fasta`) to prevent corruption of the index once multiple jobs start running. +Also, make sure the query sequence is already index with bwa (using `bwa mem query.fasta`) to prevent corruption of the index once multiple jobs start running. + +When running in improvement mode, ensure that --assemblies and --assembliesid is passed to the slurm script, but --extension and --type are passed to the ismap script through --other_args. ``` python slurm_ismap.py -h From cb140b8cf8fcae238504d9c1f205ed434a63bb51 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Tue, 22 Sep 2015 17:43:22 +1000 Subject: [PATCH 11/19] updated ranges for checking is position is overlapping and therefore should be removed. --- scripts/create_typing_out.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index f87df5c..c21df02 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -289,8 +289,8 @@ def main(): y_R = int(info[5]) # Check to see if the gap is reasonable if int(info[6]) <= 15: - R_range = range(min(x_R, y_R), max(x_R, y_R)) - L_range = range(min(x_L, y_L), max(x_L, y_L)) + R_range = range(min(x_R, y_R), max(x_R, y_R)+1) + L_range = range(min(x_L, y_L), max(x_L, y_L)+1) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n' @@ -338,8 +338,8 @@ def main(): y_L = int(info[2]) x_R = int(info[4]) y_R = int(info[5]) - R_range = range(min(x_R, y_R), max(x_R, y_R)) - L_range = range(min(x_L, y_L), max(x_L, y_L)) + R_range = range(min(x_R, y_R), max(x_R, y_R)+1) + L_range = range(min(x_L, y_L), max(x_L, y_L)+1) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): removed_results['region_' + str(region)] = line.strip() + '\tOne hit inside the other, closest.bed\n' @@ -399,8 +399,8 @@ def main(): y_L = int(info[2]) x_R = int(info[4]) y_R = int(info[5]) - R_range = range(min(x_R, y_R), max(x_R, y_R)) - L_range = range(min(x_L, y_L), max(x_L, y_L)) + R_range = range(min(x_R, y_R), max(x_R, y_R)+1) + L_range = range(min(x_L, y_L), max(x_L, y_L)+1) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, left_unpaired.bed\n' @@ -454,8 +454,8 @@ def main(): y_L = int(info[2]) x_R = int(info[4]) y_R = int(info[5]) - R_range = range(min(x_R, y_R), max(x_R, y_R)) - L_range = range(min(x_L, y_L), max(x_L, y_L)) + R_range = range(min(x_R, y_R), max(x_R, y_R)+1) + L_range = range(min(x_L, y_L), max(x_L, y_L)+1) # if one hit is inside the other hit, remove it - don't know what to do with these if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range): removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, right_unpaired.bed\n' From 83a645755897d3a05fde68e12e8c32536c53c66b Mon Sep 17 00:00:00 2001 From: jhawkey Date: Wed, 23 Sep 2015 17:03:52 +1000 Subject: [PATCH 12/19] fixed possible related IS call - must meet 50% ID and 50% cov to call --- scripts/create_typing_out.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py index c21df02..e54ca8e 100755 --- a/scripts/create_typing_out.py +++ b/scripts/create_typing_out.py @@ -207,19 +207,18 @@ def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna, else: call = 'Known' results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred] - else: - # Then I'm not sure what this is - # Get flanking genes anyway + elif len(seq_results) != 0 and seq_results[0] >=50 and seq_results[1] >= 50: + # Calling it a possible related IS if there is 50% nucleotide ID and 50% coverage gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna, len(genbank.seq)) if 'unpaired' in file_loc: call = 'Possible related IS?' else: call = 'Possible releated IS' func_pred = '' - if len(seq_results) !=0: - results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred] - else: - removed_results['region_' + str(region)] = line.strip() + '\t' + file_loc +'\n' + results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred] + else: + # otherwise this a suprious result + removed_results['region_' + str(region)] = line.strip() + '\t' + file_loc +'\n' def gbk_to_fasta(genbank, fasta): ''' From 71734fefb4c489833dc562591cd91d4470c5b9fd Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 2 Oct 2015 14:44:51 +1000 Subject: [PATCH 13/19] adding threads option for bwa --- scripts/ismap.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/scripts/ismap.py b/scripts/ismap.py index 3162590..7898568 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -62,6 +62,7 @@ def parse_args(): parser.add_argument('--merging', type=str, required=False, default='100', help='Value for merging left and right hits in bed files together to simply calculation of closest and intersecting regions (default 100).') parser.add_argument('--a', action='store_true', required=False, help='Switch on all alignment reporting for bwa.') parser.add_argument('--T', type=str, required=False, default='30', help='Mapping quality score for bwa (default 30).') + parser.add_argument('--t', type=str, required=False, default='1', help='Number of threads for bwa (default 1).') parser.add_argument('--min_clip', type=int, required=False, default='10', help='Minimum size for softclipped region to be extracted from initial mapping (default 10).') parser.add_argument('--max_clip', type=int, required=False, default=30, help='Maximum size for softclipped regions to be included (default 30).') # Options for table output (typing) @@ -103,7 +104,7 @@ def bwa_index(fasta): Check to see if bwa index for given input fasta exists. If it doesn't, build an index from the given input fasta. ''' - + #check_command_version('bwa') built_index = fasta + '.bwt' print built_index @@ -378,7 +379,7 @@ def extract_clipped_reads(sam_file, min_size, max_size, out_five_file, out_three read_name, cigar = entries[0], entries[5] #parse the cigar info, exclude hard-clipped regions (these can only be on the very edges, outside of soft-clips if soft-clipping is present) map_regions = re.findall('[0-9]+[MIDNSP=X]', cigar) - + # Get the first and last items from the cigar string and see if it's soft-clipped (last letter = S). Soft clips will only ever be at the ends (inside would be I/D/N) # If so, find out how many bases are soft-clipped # If it's the right size, add the read and it's quality score to the appropriate fastq file, reverse-complementing if needed @@ -406,7 +407,7 @@ def main(): start_time = time.time() args = parse_args() - + # Set up logfile if args.log is True: logfile = args.output + ".log" @@ -475,7 +476,7 @@ def main(): make_directories([temp_folder]) # Map to IS query - run_command(['bwa', 'mem', query, forward_read, reverse_read, '>', output_sam], shell=True) + run_command(['bwa', 'mem', '-t', args.t, query, forward_read, reverse_read, '>', output_sam], shell=True) # Pull unmapped reads flanking IS run_command(['samtools view', '-Sb', '-f 36', output_sam, '>', five_bam], shell=True) run_command(['samtools view', '-Sb', '-f 4', '-F 40', output_sam, '>', three_bam], shell=True) @@ -537,12 +538,12 @@ def main(): # Map ends back to contigs bwa_index(assembly) if args.a == True: - run_command(['bwa', 'mem', 'a', '-T', args.T, assembly, final_left_reads, '>', five_to_ref_sam], shell=True) - run_command(['bwa', 'mem', 'a', '-T', args.T, assembly, final_right_reads, '>', three_to_ref_sam], shell=True) + run_command(['bwa', 'mem', 'a', '-T', args.T, '-t', args.t, assembly, final_left_reads, '>', five_to_ref_sam], shell=True) + run_command(['bwa', 'mem', 'a', '-T', args.T, '-t', args.t, assembly, final_right_reads, '>', three_to_ref_sam], shell=True) else: - run_command(['bwa', 'mem', assembly, final_left_reads, '>', five_to_ref_sam], shell=True) - run_command(['bwa', 'mem', assembly, final_right_reads, '>', three_to_ref_sam], shell=True) - + run_command(['bwa', 'mem', '-t', args.t, assembly, final_left_reads, '>', five_to_ref_sam], shell=True) + run_command(['bwa', 'mem', '-t', args.t, assembly, final_right_reads, '>', three_to_ref_sam], shell=True) + run_command(['samtools', 'view', '-Sb', five_to_ref_sam, '>', five_to_ref_bam], shell=True) run_command(['samtools', 'view', '-Sb', three_to_ref_sam, '>', three_to_ref_bam], shell=True) run_command(['samtools', 'sort', five_to_ref_bam, five_bam_sorted], shell=True) @@ -555,7 +556,7 @@ def main(): filter_on_depth(five_cov_bed, five_final_cov, args.cutoff) filter_on_depth(three_cov_bed, three_final_cov, args.cutoff) run_command(['bedtools', 'merge', '-i', five_final_cov, '-d', args.merging, '>', five_merged_bed], shell=True) - run_command(['bedtools', 'merge', '-i', three_final_cov, '-d', args.merging, '>', three_merged_bed], shell=True) + run_command(['bedtools', 'merge', '-i', three_final_cov, '-d', args.merging, '>', three_merged_bed], shell=True) # Create table and genbank if args.extension == '.fasta': run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly, '--type fasta', '--output', sample + '_' + query_name], shell=True) @@ -574,7 +575,7 @@ def main(): # Create reference fasta from genbank gbk_to_fasta(args.typingRef, typingRefFasta) # Create bwa index file for typing reference - bwa_index(typingRefFasta) + bwa_index(typingRefFasta) # Set up file names for output files five_header = sample + '_5_' + typingName three_header = sample + '_3_' + typingName @@ -599,11 +600,11 @@ def main(): # Map reads to reference, sort if args.a == True: - run_command(['bwa', 'mem', '-a', '-T', args.T, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True) - run_command(['bwa', 'mem', '-a', '-T', args.T, typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True) + run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True) + run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t,typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True) else: - run_command(['bwa', 'mem', typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True) - run_command(['bwa', 'mem', typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True) + run_command(['bwa', 'mem', '-t', args.t, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True) + run_command(['bwa', 'mem', '-t', args.t, typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True) run_command(['samtools', 'view', '-Sb', five_to_ref_sam, '>', five_to_ref_bam], shell=True) run_command(['samtools', 'view', '-Sb', three_to_ref_sam, '>', three_to_ref_bam], shell=True) run_command(['samtools', 'sort', five_to_ref_bam, five_bam_sorted], shell=True) @@ -615,7 +616,7 @@ def main(): run_command(['bedtools', 'genomecov', '-ibam', three_bam_sorted + '.bam', '-bg', '>', three_cov_bed], shell=True) run_command(['bedtools', 'merge', '-d', args.merging, '-i', five_cov_bed, '>', five_cov_merged], shell=True) run_command(['bedtools', 'merge', '-d', args.merging, '-i', three_cov_bed, '>', three_cov_merged], shell=True) - # Filter coveraged BED files on coverage cutoff (so only take + # Filter coveraged BED files on coverage cutoff (so only take # high coverage regions for further analysis) filter_on_depth(five_cov_bed, five_final_cov, args.cutoff) filter_on_depth(three_cov_bed, three_final_cov, args.cutoff) @@ -633,9 +634,9 @@ def main(): else: igv_flag = '0' run_command([args.path + 'create_typing_out.py', '--intersect', bed_intersect, '--closest', bed_closest, - '--left_bed', five_merged_bed, '--right_bed', three_merged_bed, - '--left_unpaired', bed_unpaired_five, '--right_unpaired', bed_unpaired_three, - '--seq', query, '--ref', args.typingRef, '--temp', temp_folder, + '--left_bed', five_merged_bed, '--right_bed', three_merged_bed, + '--left_unpaired', bed_unpaired_five, '--right_unpaired', bed_unpaired_three, + '--seq', query, '--ref', args.typingRef, '--temp', temp_folder, '--cds', args.cds, '--trna', args.trna, '--rrna', args.rrna, '--min_range', args.min_range, '--max_range', args.max_range, '--output', sample + '_' + query_name, '--igv', igv_flag, '--chr_name', args.chr_name], shell=True) From 62182b1516d73a7b81b1e0da34987e81ea1efebf Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 2 Oct 2015 14:48:21 +1000 Subject: [PATCH 14/19] added option for specifying an output directory --- scripts/ismap.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/ismap.py b/scripts/ismap.py index 7898568..f5612e9 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -76,6 +76,7 @@ def parse_args(): parser.add_argument('--output', type=str, required=True, help='prefix for output files') parser.add_argument('--temp', action='store_true', required=False, help='Switch on keeping the temp folder instead of deleting it at the end of the program') parser.add_argument('--bam', action='store_true', required=False, help='Switch on keeping the final bam files instead of deleting them at the end of the program') + parser.add_argument('--directory', type=store, required=False, default='', help='Output directory for all output files.') return parser.parse_args() @@ -410,7 +411,7 @@ def main(): # Set up logfile if args.log is True: - logfile = args.output + ".log" + logfile = args.directory + args.output + ".log" else: logfile = None logging.basicConfig( @@ -461,7 +462,10 @@ def main(): # Create the output file and folder names, # make the folders where necessary - current_dir = os.getcwd() + '/' + if args.directory == '': + current_dir = os.getcwd() + '/' + else: + current_dir = args.directory temp_folder = current_dir + sample + '_' + query_name + '_temp/' output_sam = temp_folder + sample + '_' + query_name + '.sam' five_bam = temp_folder + sample + '_' + query_name + '_5.bam' From 022a431f6db15d39677b0559d691406013aecbe5 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 2 Oct 2015 14:54:39 +1000 Subject: [PATCH 15/19] typo in directory argument --- scripts/ismap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ismap.py b/scripts/ismap.py index f5612e9..d459275 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -76,7 +76,7 @@ def parse_args(): parser.add_argument('--output', type=str, required=True, help='prefix for output files') parser.add_argument('--temp', action='store_true', required=False, help='Switch on keeping the temp folder instead of deleting it at the end of the program') parser.add_argument('--bam', action='store_true', required=False, help='Switch on keeping the final bam files instead of deleting them at the end of the program') - parser.add_argument('--directory', type=store, required=False, default='', help='Output directory for all output files.') + parser.add_argument('--directory', type=str, required=False, default='', help='Output directory for all output files.') return parser.parse_args() From 8aed44e5b42acedf4c1d4602575d27cc796cc11a Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 2 Oct 2015 14:56:45 +1000 Subject: [PATCH 16/19] check to make sure directory flag has final /, add if not present --- scripts/ismap.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/ismap.py b/scripts/ismap.py index d459275..5a5358a 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -466,6 +466,9 @@ def main(): current_dir = os.getcwd() + '/' else: current_dir = args.directory + if current_dir[-1] != '/': + current_dir = current_dir + '/' + temp_folder = current_dir + sample + '_' + query_name + '_temp/' output_sam = temp_folder + sample + '_' + query_name + '.sam' five_bam = temp_folder + sample + '_' + query_name + '_5.bam' From 3c9cfbe54a086d38ef0f4e24441028580007e3a7 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Fri, 2 Oct 2015 15:42:23 +1000 Subject: [PATCH 17/19] make sure all outputs go to specified directory --- scripts/ismap.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/ismap.py b/scripts/ismap.py index 5a5358a..362f151 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -468,7 +468,7 @@ def main(): current_dir = args.directory if current_dir[-1] != '/': current_dir = current_dir + '/' - + temp_folder = current_dir + sample + '_' + query_name + '_temp/' output_sam = temp_folder + sample + '_' + query_name + '.sam' five_bam = temp_folder + sample + '_' + query_name + '_5.bam' @@ -479,7 +479,7 @@ def main(): right_clipped_reads = temp_folder + sample + '_' + query_name + '_right_clipped.fastq' final_left_reads = temp_folder + sample + '_' + query_name + '_LeftFinal.fastq' final_right_reads = temp_folder + sample + '_' + query_name + '_RightFinal.fastq' - no_hits_table = sample + '_' + query_name + '_table.txt' + no_hits_table = current_dir + sample + '_' + query_name + '_table.txt' make_directories([temp_folder]) # Map to IS query @@ -529,11 +529,11 @@ def main(): three_bam_sorted = three_header + '_' + query_name + '.sorted' five_cov_bed = temp_folder + five_header + '_' + query_name + '_cov.bed' three_cov_bed = temp_folder + three_header + '_' + query_name + '_cov.bed' - five_final_cov = five_header + '_' + query_name + '_finalcov.bed' - three_final_cov = three_header + '_' + query_name + '_finalcov.bed' - five_merged_bed = five_header + '_' + query_name + '_merged.sorted.bed' - three_merged_bed = three_header + '_' + query_name + '_merged.sorted.bed' - final_genbankSingle = sample + '_' + query_name + '_annotatedSingle.gbk' + five_final_cov = current_dir + five_header + '_' + query_name + '_finalcov.bed' + three_final_cov = current_dir + three_header + '_' + query_name + '_finalcov.bed' + five_merged_bed = current_dir + five_header + '_' + query_name + '_merged.sorted.bed' + three_merged_bed = current_dir + three_header + '_' + query_name + '_merged.sorted.bed' + final_genbankSingle = current_dir + sample + '_' + query_name + '_annotatedSingle.gbk' # create fasta file from genbank if required if args.extension == '.gbk': @@ -596,14 +596,14 @@ def main(): three_cov_bed = temp_folder + three_header + '_' + query_name + '_cov.bed' five_cov_merged = temp_folder + five_header + '_' + query_name + '_cov_merged.sorted.bed' three_cov_merged = temp_folder + three_header + '_' + query_name + '_cov_merged.sorted.bed' - five_final_cov = five_header + '_' + query_name + '_finalcov.bed' - three_final_cov = three_header + '_' + query_name + '_finalcov.bed' - five_merged_bed = five_header + '_' + query_name + '_merged.sorted.bed' - three_merged_bed = three_header + '_' + query_name + '_merged.sorted.bed' - bed_intersect = sample + '_' + typingName + '_' + query_name + '_intersect.bed' - bed_closest = sample + '_' + typingName + '_' + query_name + '_closest.bed' - bed_unpaired_five = sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed' - bed_unpaired_three = sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed' + five_final_cov = current_dir + five_header + '_' + query_name + '_finalcov.bed' + three_final_cov = current_dir + three_header + '_' + query_name + '_finalcov.bed' + five_merged_bed = current_dir + five_header + '_' + query_name + '_merged.sorted.bed' + three_merged_bed = current_dir + three_header + '_' + query_name + '_merged.sorted.bed' + bed_intersect = current_dir + sample + '_' + typingName + '_' + query_name + '_intersect.bed' + bed_closest = current_dir + sample + '_' + typingName + '_' + query_name + '_closest.bed' + bed_unpaired_five = current_dir + sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed' + bed_unpaired_three = current_dir + sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed' # Map reads to reference, sort if args.a == True: From 8c4966784bee8e9532167ad1dfe2313f1e13b62b Mon Sep 17 00:00:00 2001 From: jhawkey Date: Wed, 7 Oct 2015 10:42:23 +1100 Subject: [PATCH 18/19] updated for final output files to go in specified directory --- scripts/ismap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/ismap.py b/scripts/ismap.py index 362f151..1cc6c49 100755 --- a/scripts/ismap.py +++ b/scripts/ismap.py @@ -566,9 +566,9 @@ def main(): run_command(['bedtools', 'merge', '-i', three_final_cov, '-d', args.merging, '>', three_merged_bed], shell=True) # Create table and genbank if args.extension == '.fasta': - run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly, '--type fasta', '--output', sample + '_' + query_name], shell=True) + run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly, '--type fasta', '--output', current_dir + sample + '_' + query_name], shell=True) elif args.extension == '.gbk': - run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly_gbk, '--type genbank', '--output', sample + '_' + query_name], shell=True) + run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly_gbk, '--type genbank', '--output', current_dir + sample + '_' + query_name], shell=True) #create single entry genbank multi_to_single(sample + '_' + query_name + '_annotated.gbk', sample, final_genbankSingle) @@ -645,7 +645,7 @@ def main(): '--left_unpaired', bed_unpaired_five, '--right_unpaired', bed_unpaired_three, '--seq', query, '--ref', args.typingRef, '--temp', temp_folder, '--cds', args.cds, '--trna', args.trna, '--rrna', args.rrna, '--min_range', args.min_range, - '--max_range', args.max_range, '--output', sample + '_' + query_name, '--igv', igv_flag, '--chr_name', args.chr_name], shell=True) + '--max_range', args.max_range, '--output', current_dir + sample + '_' + query_name, '--igv', igv_flag, '--chr_name', args.chr_name], shell=True) # remove temp folder if required if args.temp == False: From 66da87c7c45143bd79e967fcdb1ffe6058d4bf70 Mon Sep 17 00:00:00 2001 From: jhawkey Date: Wed, 7 Oct 2015 11:14:01 +1100 Subject: [PATCH 19/19] updated changes,readme and version --- CHANGES.txt | 9 +++++++++ README.md | 12 +++++++----- setup.py | 4 ++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 441a48e..7bf2ec8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,12 @@ +v0.1.4, Wed 7 Oct 2015 + * can now supply output directory using --directory command + * can now ask BWA to use multiple threads using --t command + * fixed some cases where an unpaired hit would pair incorrectly with itself + * correct distances returned if the flanking gene should be the first or last + gene in the reference + * fixed an error where no hits reported but bed files are non-empty + * possible related IS call must now meet 50% ID and 50% coverage threshold + otherwise the hit is discarded v0.1.3, Mon 10 Aug 2015 * fixed soft clipped read selection (Special thanks to Katie Cox for helping with this one) * bam files now deleted by default, can be kept using the --bam flag diff --git a/README.md b/README.md index 492343f..f46830a 100755 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ Once ISMapper has finished running, multiple output files will be generated, the ## Common Issues -When running ISMapper on typing mode with a reference genome in Genbank format, check to see if all CDS/tRNA/rRNA features have the locus_tag qualifier. This is the default setting for ISMapper when choosing what to print in the final table. If the reference genome doesn't have locus_tag's, ISMapper will throw an error in the final table create step. This can be avoided by changing the setting of `--cds` (or `--trna` or `--rrna`) flags to db_xref gene product (or whatever other identifiers are present in your genbank file). +When running ISMapper on typing mode with a reference genome in Genbank format, check to see if all CDS/tRNA/rRNA features have the locus_tag qualifier. This is the default setting for ISMapper when choosing what to print in the final table. If the reference genome doesn't have locus_tag's, ISMapper will throw an error in the final table create step. This can be avoided by changing the setting of `--cds` (or `--trna` or `--rrna`) flags to db_xref gene product (or whatever other identifiers are present in your genbank file). ## Adavnced Options for ismap @@ -173,7 +173,7 @@ When running ISMapper on typing mode with a reference genome in Genbank format, `--min_clip` and `--max_clip` are used to determine the size of soft clipped sections of reads that are including in the final mapping step. The default size is currently 10 bp and 30 bp, which is ideal for reads between 100 - 150bp. To improve the detection of the exact target site duplication size, it is sometimes helpful to increase the size of `--max_clip`. However large values of `--max_clip` can increase the amount of noise in the final mapping step and cause nonsensical results. -`--a` and `--T` are flags that are passed to BWA. --a will turn on all alignment reporting in BWA, and --T is used to give an integer mapping score to BWA to determine what alignments are kept. These options may be useful in finding IS query positions that are next to repeated elements as BWA will report all hits for the read not just the best random hit. However, using these options may cause noise and confusion in the final output files. +`--a`, `--T` and `--t` are flags that are passed to BWA. --a will turn on all alignment reporting in BWA, and --T is used to give an integer mapping score to BWA to determine what alignments are kept. These options may be useful in finding IS query positions that are next to repeated elements as BWA will report all hits for the read not just the best random hit. However, using these options may cause noise and confusion in the final output files. `--t` is used to supply more threads to BWA if required. `--cds`, `--trna` and `--rrna` are used to specify what qualifiers will be looked for in the reference genbank when determining genes flanking the IS query location. Defaults are locus_tag gene product. @@ -183,9 +183,11 @@ When running ISMapper on typing mode with a reference genome in Genbank format, `--log` turns on the log file. +`--directory` sets an output directory for the output files (defualt is the directory where ISMapper is being run). + `--temp` turns on keeping the temporary files instead of deleting them once the run has completed. -`--bam` turns on keeping the final sorted and indexed BAM files of flanking reads for comparison against the reference genome (by default these files are deleted to save disk space). +`--bam` turns on keeping the final sorted and indexed BAM files of flanking reads for comparison against the reference genome (by default these files are deleted to save disk space). ## Other options for compiled_table @@ -198,7 +200,7 @@ When running ISMapper on typing mode with a reference genome in Genbank format, ## Running ISMapper without installing -ISMapper can be run directly from its directory without installing it via pip. To do so, ismap.py needs the path to the folder that contains all the scripts supplied to the argument `--path`. +ISMapper can be run directly from its directory without installing it via pip. To do so, ismap.py needs the path to the folder that contains all the scripts supplied to the argument `--path`. eg: `ismap.py --reads x_1.fastq.gz x_2.fastq.gz --queries is_query.fasta --path /path/to/IS_mapper/scripts/` @@ -254,4 +256,4 @@ optional arguments: --other_args OTHER_ARGS String containing all other arguments to pass to ISMapper -``` \ No newline at end of file +``` diff --git a/setup.py b/setup.py index 5d4d8ff..613dfbb 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='ISMapper', - version='0.1.3', + version='0.1.4', author='Jane Hawkey', author_email='hawkey.jane@gmail.com', packages=['ismap'], @@ -33,4 +33,4 @@ #"scipy >= 0.12.0", #"biopython == 1.63", ], -) \ No newline at end of file +)