diff --git a/CHANGES.txt b/CHANGES.txt
index 441a48e..7bf2ec8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,12 @@
+v0.1.4, Wed 7 Oct 2015
+ * can now supply output directory using --directory command
+ * can now ask BWA to use multiple threads using --t command
+ * fixed some cases where an unpaired hit would pair incorrectly with itself
+ * correct distances returned if the flanking gene should be the first or last
+   gene in the reference
+ * fixed an error where no hits reported but bed files are non-empty
+ * possible related IS call must now meet 50% ID and 50% coverage threshold
+   otherwise the hit is discarded
 v0.1.3, Mon 10 Aug 2015
 * fixed soft clipped read selection (Special thanks to Katie Cox for helping with this one)
 * bam files now deleted by default, can be kept using the --bam flag
diff --git a/README.md b/README.md
index aeaeadc..f46830a 100755
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ Once ISMapper has finished running, multiple output files will be generated, the

 ## Common Issues

-When running ISMapper on typing mode with a reference genome in Genbank format, check to see if all CDS/tRNA/rRNA features have the locus_tag qualifier. This is the default setting for ISMapper when choosing what to print in the final table. If the reference genome doesn't have locus_tag's, ISMapper will throw an error in the final table create step. This can be avoided by changing the setting of `--cds` (or `--trna` or `--rrna`) flags to db_xref gene product (or whatever other identifiers are present in your genbank file).
+When running ISMapper in typing mode with a reference genome in Genbank format, check to see if all CDS/tRNA/rRNA features have the locus_tag qualifier. This is the default setting for ISMapper when choosing what to print in the final table. If the reference genome doesn't have locus_tags, ISMapper will throw an error in the final table creation step. This can be avoided by changing the setting of the `--cds` (or `--trna` or `--rrna`) flags to db_xref gene product (or whatever other identifiers are present in your genbank file).

 ## Advanced Options for ismap

@@ -173,7 +173,7 @@ When running ISMapper on typing mode with a reference genome in Genbank format,

 `--min_clip` and `--max_clip` are used to determine the size of soft clipped sections of reads that are included in the final mapping step. The default sizes are currently 10 bp and 30 bp, which is ideal for reads between 100 - 150 bp. To improve the detection of the exact target site duplication size, it is sometimes helpful to increase the size of `--max_clip`. However, large values of `--max_clip` can increase the amount of noise in the final mapping step and cause nonsensical results.

-`--a` and `--T` are flags that are passed to BWA. --a will turn on all alignment reporting in BWA, and --T is used to give an integer mapping score to BWA to determine what alignments are kept. These options may be useful in finding IS query positions that are next to repeated elements as BWA will report all hits for the read not just the best random hit. However, using these options may cause noise and confusion in the final output files.
+`--a`, `--T` and `--t` are flags that are passed to BWA. `--a` will turn on all alignment reporting in BWA, and `--T` is used to give an integer mapping score to BWA to determine what alignments are kept. These options may be useful in finding IS query positions that are next to repeated elements, as BWA will report all hits for the read, not just the best random hit. However, using these options may cause noise and confusion in the final output files. `--t` is used to supply more threads to BWA if required.

 `--cds`, `--trna` and `--rrna` are used to specify what qualifiers will be looked for in the reference genbank when determining genes flanking the IS query location. Defaults are locus_tag gene product.

@@ -183,9 +185,11 @@ When running ISMapper on typing mode with a reference genome in Genbank format,

 `--log` turns on the log file.

+`--directory` sets an output directory for the output files (default is the directory where ISMapper is being run).
+
 `--temp` turns on keeping the temporary files instead of deleting them once the run has completed.

-`--bam` turns on keeping the final sorted and indexed BAM files of flanking reads for comparison against the reference genome (by default these files are deleted to save disk space).
+`--bam` turns on keeping the final sorted and indexed BAM files of flanking reads for comparison against the reference genome (by default these files are deleted to save disk space).

 ## Other options for compiled_table

@@ -198,7 +200,7 @@ When running ISMapper on typing mode with a reference genome in Genbank format,

 ## Running ISMapper without installing

-ISMapper can be run directly from its directory without installing it via pip. To do so, ismap.py needs the path to the folder that contains all the scripts supplied to the argument `--path`.
+ISMapper can be run directly from its directory without installing it via pip. To do so, ismap.py needs the path to the folder that contains all the scripts, supplied to the argument `--path`.

 eg:

 `ismap.py --reads x_1.fastq.gz x_2.fastq.gz --queries is_query.fasta --path /path/to/IS_mapper/scripts/`

@@ -211,7 +213,9 @@ eg:

 ## Running multiple jobs on a SLURM queuing system

 There is no need to set the --output flag in other_args for this script, it sets it itself using the name of the readset.
-Also, make sure the query sequence is already index with bwa (using `bwa mem query.fasta`) to prevent corruption of the index once multiple jobs start running.
+Also, make sure the query sequence is already indexed with bwa (using `bwa index query.fasta`) to prevent corruption of the index once multiple jobs start running.
+
+When running in improvement mode, ensure that --assemblies and --assembliesid are passed to the slurm script, but --extension and --type are passed to the ismap script through --other_args.
 ```
 python slurm_ismap.py -h
@@ -252,4 +256,4 @@ optional arguments:
   --other_args OTHER_ARGS
                         String containing all other arguments to pass to ISMapper
-```
\ No newline at end of file
+```
diff --git a/scripts/compiled_table.py b/scripts/compiled_table.py
index 6f63f3c..b5defa1 100644
--- a/scripts/compiled_table.py
+++ b/scripts/compiled_table.py
@@ -295,6 +295,17 @@ def findFeatureAfterPosition(features, isPosition, m):
     # If we are looking for the feature to the right of the
     # IS position, then either m or m+1 is our answer
+    # an index error will occur if m is the final feature, so just check that the first part is true
+    # and return m
+    try:
+        features[m+1]
+    except IndexError:
+        if features[m][0] > isPosition:
+            return features[m][2]
+        # otherwise we must be after the final position, so need to
+        # return the start position of the very first feature
+        else:
+            return features[0][2]
     # If the end of the m feature is before the IS position,
     # then m is before the IS and m+1 is the correct feature
     if features[m][1] < isPosition:
@@ -307,15 +318,22 @@ def findFeatureAfterPosition(features, isPosition, m):
     # then m will be closer to the IS and is the correct feature
     elif features[m][0] > isPosition and features[m+1][0] > isPosition:
         return features[m][2]
-
     else:
         return "3 - THIS SHOULDN'T HAPPEN!"

-def get_flanking_genes(features, feature_list, left, right, cds_features, trna_features, rrna_features):
+def get_flanking_genes(features, feature_list, left, right, cds_features, trna_features, rrna_features, genome_size):
     # Find the correct indexes
     left_feature_index = binary_search(feature_list, left, 'L')
     right_feature_index = binary_search(feature_list, right, 'R')
+    # Print out information if returning one of the error codes
+    if type(left_feature_index) != int or type(right_feature_index) != int:
+        print 'left index'
+        print left_feature_index
+        print 'right index'
+        print right_feature_index
+        print 'left position: ' + str(left)
+        print 'right position: ' + str(right)
     # Extract the SeqFeature object that corresponds to that index
     left_feature = features[left_feature_index]
     right_feature = features[right_feature_index]
@@ -331,6 +349,32 @@ def get_flanking_genes(features, feature_list, left, right, cds_features, trna_f
     left_dist = abs(max(left_feature.location.start, left_feature.location.end) - left)
     # The distance to the right gene is the startmost position of the feature - the right IS coord
     right_dist = abs(min(right_feature.location.start, right_feature.location.end) - right)
+
+    # Here is probably a good place to check if we've got a position that wraps around from start
+    # to end of the reference
+    # If we've got a distance that is close to the size of the reference, then we know we need to
+    # alter it
+    if left_dist in range(int(round(genome_size * 0.9)), int(round(genome_size * 1.1))):
+        # Then the left hand feature is at the end of the genome
+        # Distance from IS position to start of the genome is the
+        # position itself
+        dist_to_start = left
+        # Distance from the end of the final gene to the end of the
+        # genome
+        dist_to_end = abs(left_feature.location.end - genome_size)
+        # So the total distance is those two added together
+        left_dist = dist_to_start + dist_to_end
+
+    elif right_dist in range(int(round(genome_size * 0.9)), int(round(genome_size * 1.1))):
+        # Then the right hand feature is at the start of the genome
+        # Distance from the IS position to the end of the genome
+        dist_to_end = abs(genome_size - right)
+        # Distance from the start of the genome to the start of the first feature
+        # is the start position of the first feature
+        dist_to_feature = right_feature.location.start
+        # So the total distance is those two added together
+        right_dist = dist_to_end + dist_to_feature
+
     # The first string in this values list is the main gene id (eg locus_tag)
     left_gene = [left_values[0], str(left_dist), left_values[1:]]
     right_gene = [right_values[0], str(right_dist), right_values[1:]]
@@ -528,10 +572,11 @@ def main():
                 feature_count += 1
             else:
                 feature_count += 1
-
+    # Sort the list just in case it's out of order (has caused issues in the past!!)
+    feature_list = sorted(feature_list, key=itemgetter(0))
     # Get flanking genes
     for pos in list_of_positions:
-        genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna)
+        genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna, len(gb.seq))
         pos.left_feature = genes_before
         pos.right_feature = genes_after
diff --git a/scripts/create_typing_out.py b/scripts/create_typing_out.py
index 0c542c9..e54ca8e 100755
--- a/scripts/create_typing_out.py
+++ b/scripts/create_typing_out.py
@@ -139,7 +139,7 @@ def novel_hit(x_L, y_L, x_R, y_R, x, y, genbank, ref, cds, trna, rrna, gap, orie
         genbank.features.append(right_feature)
     # Get the genes flanking the left and right ends
-    gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna)
+    gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna, len(genbank.seq))
     #print gene_left
     #print gene_right
     # If the genes are the same, then hit is inside the gene
@@ -187,7 +187,7 @@ def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna,
     # Taking all four coordinates and finding min and max to avoid coordinates
     # that overlap the actual IS (don't want to return those in gene calls)
     # Mark as a known call to improve accuracy of gene calling
-    gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna)
+    gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna, len(genbank.seq))
     #gene_left = get_other_gene(ref, min(y_L, y_R, x_R, x_L), "left", cds, trna, rrna, known=True)
     #gene_right = get_other_gene(ref, max(y_L, y_R, x_R, x_L), "right", cds, trna, rrna, known=True)

@@ -207,19 +207,18 @@ def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna,
         else:
             call = 'Known'
         results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred]
-    else:
-        # Then I'm not sure what this is
-        # Get flanking genes anyway
-        gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna)
+    elif len(seq_results) != 0 and seq_results[0] >= 50 and seq_results[1] >= 50:
+        # Calling it a possible related IS if there is 50% nucleotide ID and 50% coverage
+        gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna, len(genbank.seq))
         if 'unpaired' in file_loc:
             call = 'Possible related IS?'
         else:
             call = 'Possible related IS'
         func_pred = ''
-        if len(seq_results) !=0:
-            results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred]
-        else:
-            removed_results['region_' + str(region)] = line.strip() + '\t' + file_loc +'\n'
+        results['region_' + str(region)] = [orient, str(start), str(end), gap, call, str(seq_results[0]), str('%.2f' % seq_results[1]), gene_left[-1][:-1], gene_left[-1][-1], gene_left[1], gene_right[-1][:-1], gene_right[-1][-1], gene_right[1], func_pred]
+    else:
+        # otherwise this is a spurious result
+        removed_results['region_' + str(region)] = line.strip() + '\t' + file_loc +'\n'

 def gbk_to_fasta(genbank, fasta):
     '''
@@ -264,6 +263,7 @@ def main():
         else:
             feature_count_list += 1

+    feature_list = sorted(feature_list, key=itemgetter(0))
     # Initialise feature count
     feature_count = 0

@@ -288,8 +288,8 @@
             y_R = int(info[5])
             # Check to see if the gap is reasonable
             if int(info[6]) <= 15:
-                R_range = range(min(x_R, y_R), max(x_R, y_R))
-                L_range = range(min(x_L, y_L), max(x_L, y_L))
+                R_range = range(min(x_R, y_R), max(x_R, y_R)+1)
+                L_range = range(min(x_L, y_L), max(x_L, y_L)+1)
                 # if one hit is inside the other hit, remove it - don't know what to do with these
                 if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range):
                     removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n'
@@ -316,7 +316,6 @@
                 else:
                     removed_results['region_' + str(lines)] = line.strip() + '\tintersect.bed\n'
             lines += 1
-
     # Get size of IS query
     is_length = insertion_length(args.seq)
     # Look inside the closest file (known or imprecise hits)
@@ -338,8 +337,8 @@
             y_L = int(info[2])
             x_R = int(info[4])
             y_R = int(info[5])
-            R_range = range(min(x_R, y_R), max(x_R, y_R))
-            L_range = range(min(x_L, y_L), max(x_L, y_L))
+            R_range = range(min(x_R, y_R), max(x_R, y_R)+1)
+            L_range = range(min(x_L, y_L), max(x_L, y_L)+1)
             # if one hit is inside the other hit, remove it - don't know what to do with these
             if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range):
                 removed_results['region_' + str(region)] = line.strip() + '\tOne hit inside the other, closest.bed\n'
@@ -399,11 +398,14 @@
             y_L = int(info[2])
             x_R = int(info[4])
             y_R = int(info[5])
-            R_range = range(min(x_R, y_R), max(x_R, y_R))
-            L_range = range(min(x_L, y_L), max(x_L, y_L))
+            R_range = range(min(x_R, y_R), max(x_R, y_R)+1)
+            L_range = range(min(x_L, y_L), max(x_L, y_L)+1)
             # if one hit is inside the other hit, remove it - don't know what to do with these
             if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range):
-                removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n'
+                removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, left_unpaired.bed\n'
+            # or if a hit is paired with itself, which means it doesn't have a pair
+            elif x_L == x_R and y_L == y_R:
+                removed_results['region_' + str(region)] = line.strip() + '\tleft_unpaired.bed, paired with self\n'
             else:
                 # Get orientation
                 if x_L < x_R and y_L < y_R:
@@ -451,11 +453,14 @@
             y_L = int(info[2])
             x_R = int(info[4])
             y_R = int(info[5])
-            R_range = range(min(x_R, y_R), max(x_R, y_R))
-            L_range = range(min(x_L, y_L), max(x_L, y_L))
+            R_range = range(min(x_R, y_R), max(x_R, y_R)+1)
+            L_range = range(min(x_L, y_L), max(x_L, y_L)+1)
             # if one hit is inside the other hit, remove it - don't know what to do with these
             if (x_L in R_range and y_L in R_range) or (x_R in L_range and y_R in L_range):
-                removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, intersect.bed\n'
+                removed_results['region_' + str(lines)] = line.strip() + '\tOne hit inside the other, right_unpaired.bed\n'
+            # or if a hit is paired with itself, which means it doesn't have a pair
+            elif x_L == x_R and y_L == y_R:
+                removed_results['region_' + str(region)] = line.strip() + '\tright_unpaired.bed, paired with self\n'
             else:
                 #get orientation
                 if x_L < x_R and y_L < y_R:
@@ -485,6 +490,9 @@
             else:
                 removed_results['region_' + str(region)] = line.strip() + '\tright_unpaired.bed\n'
             region += 1
+    # Open the output file and write in the header
+    output = open(args.output + '_table.txt', 'w')
+    output.write('\t'.join(header) + '\n')
     # Sort regions into the correct order
     table_keys = []
     for key in results:
@@ -493,32 +501,30 @@
     for region in table_keys:
         region_indexes.append(region.split('region_')[1])
     arr = np.vstack((table_keys, region_indexes)).transpose()
-    if arr != 0:
-        sorted_keys = arr[arr[:,1].astype('int').argsort()]
-    # Write out the found hits to file
-    output = open(args.output + '_table.txt', 'w')
-    output.write('\t'.join(header) + '\n')
     if arr != 0:
+        sorted_keys = arr[arr[:,1].astype('int').argsort()]
         for key in sorted_keys[:,0]:
             output.write(key + '\t' + '\t'.join(str(i) for i in results[key]) + '\n')
-    if arr == 0:
+    # If all the hits failed, write out that no hits were found
+    elif arr == 0:
         output.write('No hits found.')
     output.close()

     # Write out the found hits to bed file for viewing in IGV
     with open(args.output + '_hits.bed', 'w') as outfile:
-        if args.chr_name == 'not_specified':
-            args.chr_name = SeqIO.read(args.ref, 'genbank').id
-        if args.igv == 1:
-            outfile.write('#gffTags \n')
-            for key in sorted_keys[:,0]:
-                r = results[key]
-                outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + '\t' + 'Name=' + key + ';orientation=' + r[0] + ';' + ';'.join([header[i+1] + '=' + str(r[i]) for i in range(3, len(r))])+ '\n')
-        else:
-            for key in sorted_keys[:,0]:
-                r = results[key]
-                outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + key + '\n')
+        if arr != 0:
+            if args.chr_name == 'not_specified':
+                args.chr_name = SeqIO.read(args.ref, 'genbank').id
+            if args.igv == 1:
+                outfile.write('#gffTags \n')
+                for key in sorted_keys[:,0]:
+                    r = results[key]
+                    outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + '\t' + 'Name=' + key + ';orientation=' + r[0] + ';' + ';'.join([header[i+1] + '=' + str(r[i]) for i in range(3, len(r))])+ '\n')
+            else:
+                for key in sorted_keys[:,0]:
+                    r = results[key]
+                    outfile.write(args.chr_name + '\t' + r[1] + '\t' + r[2] + key + '\n')

     # Write out hits that were removed for whatever reason to file
     if len(removed_results) != 0:
diff --git a/scripts/ismap.py b/scripts/ismap.py
index 3162590..1cc6c49 100755
--- a/scripts/ismap.py
+++ b/scripts/ismap.py
@@ -62,6 +62,7 @@ def parse_args():
     parser.add_argument('--merging', type=str, required=False, default='100', help='Value for merging left and right hits in bed files together to simply calculation of closest and intersecting regions (default 100).')
     parser.add_argument('--a', action='store_true', required=False, help='Switch on all alignment reporting for bwa.')
     parser.add_argument('--T', type=str, required=False, default='30', help='Mapping quality score for bwa (default 30).')
+    parser.add_argument('--t', type=str, required=False, default='1', help='Number of threads for bwa (default 1).')
     parser.add_argument('--min_clip', type=int, required=False, default='10', help='Minimum size for softclipped region to be extracted from initial mapping (default 10).')
     parser.add_argument('--max_clip', type=int, required=False, default=30, help='Maximum size for softclipped regions to be included (default 30).')
     # Options for table output (typing)
@@ -75,6 +76,7 @@
     parser.add_argument('--output', type=str, required=True, help='prefix for output files')
     parser.add_argument('--temp', action='store_true', required=False, help='Switch on keeping the temp folder instead of deleting it at the end of the program')
     parser.add_argument('--bam', action='store_true', required=False, help='Switch on keeping the final bam files instead of deleting them at the end of the program')
+    parser.add_argument('--directory', type=str, required=False, default='', help='Output directory for all output files.')

     return parser.parse_args()
@@ -103,7 +105,7 @@ def bwa_index(fasta):
     Check to see if bwa index for given input fasta exists.
    If it doesn't, build an index from the given input fasta.
     '''
-
+
     #check_command_version('bwa')
     built_index = fasta + '.bwt'
     print built_index
@@ -378,7 +380,7 @@ def extract_clipped_reads(sam_file, min_size, max_size, out_five_file, out_three
         read_name, cigar = entries[0], entries[5]
         #parse the cigar info, exclude hard-clipped regions (these can only be on the very edges, outside of soft-clips if soft-clipping is present)
         map_regions = re.findall('[0-9]+[MIDNSP=X]', cigar)
-
+
         # Get the first and last items from the cigar string and see if it's soft-clipped (last letter = S). Soft clips will only ever be at the ends (inside would be I/D/N)
         # If so, find out how many bases are soft-clipped
         # If it's the right size, add the read and its quality score to the appropriate fastq file, reverse-complementing if needed
@@ -406,10 +408,10 @@
 def main():
     start_time = time.time()
     args = parse_args()
-
+
     # Set up logfile
     if args.log is True:
-        logfile = args.output + ".log"
+        logfile = args.directory + args.output + ".log"
     else:
         logfile = None
     logging.basicConfig(
@@ -460,7 +462,13 @@
     # Create the output file and folder names,
     # make the folders where necessary
-    current_dir = os.getcwd() + '/'
+    if args.directory == '':
+        current_dir = os.getcwd() + '/'
+    else:
+        current_dir = args.directory
+        if current_dir[-1] != '/':
+            current_dir = current_dir + '/'
+
     temp_folder = current_dir + sample + '_' + query_name + '_temp/'
     output_sam = temp_folder + sample + '_' + query_name + '.sam'
     five_bam = temp_folder + sample + '_' + query_name + '_5.bam'
@@ -471,11 +479,11 @@
     right_clipped_reads = temp_folder + sample + '_' + query_name + '_right_clipped.fastq'
     final_left_reads = temp_folder + sample + '_' + query_name + '_LeftFinal.fastq'
     final_right_reads = temp_folder + sample + '_' + query_name + '_RightFinal.fastq'
-    no_hits_table = sample + '_' + query_name + '_table.txt'
+    no_hits_table = current_dir + sample + '_' + query_name + '_table.txt'
     make_directories([temp_folder])

     # Map to IS query
-    run_command(['bwa', 'mem', query, forward_read, reverse_read, '>', output_sam], shell=True)
+    run_command(['bwa', 'mem', '-t', args.t, query, forward_read, reverse_read, '>', output_sam], shell=True)
     # Pull unmapped reads flanking IS
     run_command(['samtools view', '-Sb', '-f 36', output_sam, '>', five_bam], shell=True)
     run_command(['samtools view', '-Sb', '-f 4', '-F 40', output_sam, '>', three_bam], shell=True)
@@ -521,11 +529,11 @@
     three_bam_sorted = three_header + '_' + query_name + '.sorted'
     five_cov_bed = temp_folder + five_header + '_' + query_name + '_cov.bed'
     three_cov_bed = temp_folder + three_header + '_' + query_name + '_cov.bed'
-    five_final_cov = five_header + '_' + query_name + '_finalcov.bed'
-    three_final_cov = three_header + '_' + query_name + '_finalcov.bed'
-    five_merged_bed = five_header + '_' + query_name + '_merged.sorted.bed'
-    three_merged_bed = three_header + '_' + query_name + '_merged.sorted.bed'
-    final_genbankSingle = sample + '_' + query_name + '_annotatedSingle.gbk'
+    five_final_cov = current_dir + five_header + '_' + query_name + '_finalcov.bed'
+    three_final_cov = current_dir + three_header + '_' + query_name + '_finalcov.bed'
+    five_merged_bed = current_dir + five_header + '_' + query_name + '_merged.sorted.bed'
+    three_merged_bed = current_dir + three_header + '_' + query_name + '_merged.sorted.bed'
+    final_genbankSingle = current_dir + sample + '_' + query_name + '_annotatedSingle.gbk'

     # create fasta file from genbank if required
     if args.extension == '.gbk':
@@ -537,12 +545,12 @@
     # Map ends back to contigs
     bwa_index(assembly)
     if args.a == True:
-        run_command(['bwa', 'mem', 'a', '-T', args.T, assembly, final_left_reads, '>', five_to_ref_sam], shell=True)
-        run_command(['bwa', 'mem', 'a', '-T', args.T, assembly, final_right_reads, '>', three_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t, assembly, final_left_reads, '>', five_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t, assembly, final_right_reads, '>', three_to_ref_sam], shell=True)
     else:
-        run_command(['bwa', 'mem', assembly, final_left_reads, '>', five_to_ref_sam], shell=True)
-        run_command(['bwa', 'mem', assembly, final_right_reads, '>', three_to_ref_sam], shell=True)
-
+        run_command(['bwa', 'mem', '-t', args.t, assembly, final_left_reads, '>', five_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-t', args.t, assembly, final_right_reads, '>', three_to_ref_sam], shell=True)
+
     run_command(['samtools', 'view', '-Sb', five_to_ref_sam, '>', five_to_ref_bam], shell=True)
     run_command(['samtools', 'view', '-Sb', three_to_ref_sam, '>', three_to_ref_bam], shell=True)
     run_command(['samtools', 'sort', five_to_ref_bam, five_bam_sorted], shell=True)
@@ -555,12 +563,12 @@
     filter_on_depth(five_cov_bed, five_final_cov, args.cutoff)
     filter_on_depth(three_cov_bed, three_final_cov, args.cutoff)
     run_command(['bedtools', 'merge', '-i', five_final_cov, '-d', args.merging, '>', five_merged_bed], shell=True)
-    run_command(['bedtools', 'merge', '-i', three_final_cov, '-d', args.merging, '>', three_merged_bed], shell=True)
+    run_command(['bedtools', 'merge', '-i', three_final_cov, '-d', args.merging, '>', three_merged_bed], shell=True)

     # Create table and genbank
     if args.extension == '.fasta':
-        run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly, '--type fasta', '--output', sample + '_' + query_name], shell=True)
+        run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly, '--type fasta', '--output', current_dir + sample + '_' + query_name], shell=True)
     elif args.extension == '.gbk':
-        run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly_gbk, '--type genbank', '--output', sample + '_' + query_name], shell=True)
+        run_command([args.path + 'create_genbank_table.py', '--five_bed', five_merged_bed, '--three_bed', three_merged_bed, '--assembly', assembly_gbk, '--type genbank', '--output', current_dir + sample + '_' + query_name], shell=True)

     #create single entry genbank
     multi_to_single(sample + '_' + query_name + '_annotated.gbk', sample, final_genbankSingle)
@@ -574,7 +582,7 @@
     # Create reference fasta from genbank
     gbk_to_fasta(args.typingRef, typingRefFasta)
     # Create bwa index file for typing reference
-    bwa_index(typingRefFasta)
+    bwa_index(typingRefFasta)
     # Set up file names for output files
     five_header = sample + '_5_' + typingName
     three_header = sample + '_3_' + typingName
@@ -588,22 +596,22 @@
     three_cov_bed = temp_folder + three_header + '_' + query_name + '_cov.bed'
     five_cov_merged = temp_folder + five_header + '_' + query_name + '_cov_merged.sorted.bed'
     three_cov_merged = temp_folder + three_header + '_' + query_name + '_cov_merged.sorted.bed'
-    five_final_cov = five_header + '_' + query_name + '_finalcov.bed'
-    three_final_cov = three_header + '_' + query_name + '_finalcov.bed'
-    five_merged_bed = five_header + '_' + query_name + '_merged.sorted.bed'
-    three_merged_bed = three_header + '_' + query_name + '_merged.sorted.bed'
-    bed_intersect = sample + '_' + typingName + '_' + query_name + '_intersect.bed'
-    bed_closest = sample + '_' + typingName + '_' + query_name + '_closest.bed'
-    bed_unpaired_five = sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed'
-    bed_unpaired_three = sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed'
+    five_final_cov = current_dir + five_header + '_' + query_name + '_finalcov.bed'
+    three_final_cov = current_dir + three_header + '_' + query_name + '_finalcov.bed'
+    five_merged_bed = current_dir + five_header + '_' + query_name + '_merged.sorted.bed'
+    three_merged_bed = current_dir + three_header + '_' + query_name + '_merged.sorted.bed'
+    bed_intersect = current_dir + sample + '_' + typingName + '_' + query_name + '_intersect.bed'
+    bed_closest = current_dir + sample + '_' + typingName + '_' + query_name + '_closest.bed'
+    bed_unpaired_five = current_dir + sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed'
+    bed_unpaired_three = current_dir + sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed'

     # Map reads to reference, sort
     if args.a == True:
-        run_command(['bwa', 'mem', '-a', '-T', args.T, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True)
-        run_command(['bwa', 'mem', '-a', '-T', args.T, typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-a', '-T', args.T, '-t', args.t, typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True)
     else:
-        run_command(['bwa', 'mem', typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True)
-        run_command(['bwa', 'mem', typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-t', args.t, typingRefFasta, final_left_reads, '>', five_to_ref_sam], shell=True)
+        run_command(['bwa', 'mem', '-t', args.t, typingRefFasta, final_right_reads, '>', three_to_ref_sam], shell=True)
     run_command(['samtools', 'view', '-Sb', five_to_ref_sam, '>', five_to_ref_bam], shell=True)
     run_command(['samtools', 'view', '-Sb', three_to_ref_sam, '>', three_to_ref_bam], shell=True)
     run_command(['samtools', 'sort', five_to_ref_bam, five_bam_sorted], shell=True)
@@ -615,7 +623,7 @@
     run_command(['bedtools', 'genomecov', '-ibam', three_bam_sorted + '.bam', '-bg', '>', three_cov_bed], shell=True)
     run_command(['bedtools', 'merge', '-d', args.merging, '-i', five_cov_bed, '>', five_cov_merged], shell=True)
     run_command(['bedtools', 'merge', '-d', args.merging, '-i', three_cov_bed, '>', three_cov_merged], shell=True)
-    # Filter coveraged BED files on coverage cutoff (so only take
+    # Filter coverage BED files on coverage cutoff (so only take
     # high coverage regions for further analysis)
     filter_on_depth(five_cov_bed, five_final_cov, args.cutoff)
     filter_on_depth(three_cov_bed, three_final_cov, args.cutoff)
@@ -633,11 +641,11 @@
         else:
             igv_flag = '0'
         run_command([args.path + 'create_typing_out.py', '--intersect', bed_intersect, '--closest', bed_closest,
-                    '--left_bed', five_merged_bed, '--right_bed', three_merged_bed,
-                    '--left_unpaired', bed_unpaired_five, '--right_unpaired', bed_unpaired_three,
-                    '--seq', query, '--ref', args.typingRef, '--temp', temp_folder,
+                    '--left_bed', five_merged_bed, '--right_bed', three_merged_bed,
+                    '--left_unpaired', bed_unpaired_five, '--right_unpaired', bed_unpaired_three,
+                    '--seq', query, '--ref', args.typingRef, '--temp', temp_folder,
                     '--cds', args.cds, '--trna', args.trna, '--rrna', args.rrna, '--min_range', args.min_range,
-                    '--max_range', args.max_range, '--output', sample + '_' + query_name, '--igv', igv_flag, '--chr_name', args.chr_name], shell=True)
+                    '--max_range', args.max_range, '--output', current_dir + sample + '_' + query_name, '--igv', igv_flag, '--chr_name', args.chr_name], shell=True)

     # remove temp folder if required
     if args.temp == False:
diff --git a/setup.py b/setup.py
index 5d4d8ff..613dfbb 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 setup(
     name='ISMapper',
-    version='0.1.3',
+    version='0.1.4',
     author='Jane Hawkey',
     author_email='hawkey.jane@gmail.com',
     packages=['ismap'],
@@ -33,4 +33,4 @@
         #"scipy >= 0.12.0",
         #"biopython == 1.63",
     ],
-)
\ No newline at end of file
+)
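The circular-genome fix added to `get_flanking_genes` in the compiled_table.py hunk above is easier to see with toy numbers: when an IS position sits near the origin of a circular reference and its flanking gene is the last feature in the record, the naive distance comes out close to the full genome length, so the new code re-measures the gap across the origin instead. A minimal, self-contained sketch of that correction; the coordinates and the helper name `wrapped_distance` are invented for illustration and are not part of ISMapper:

```python
# Illustrative sketch of the wrap-around distance correction in compiled_table.py.
def wrapped_distance(is_pos, feature_end, genome_size):
    naive = abs(feature_end - is_pos)
    # A distance close to the genome length means the gap really crosses
    # the origin, so measure it the short way around instead.
    if round(genome_size * 0.9) <= naive <= round(genome_size * 1.1):
        return is_pos + (genome_size - feature_end)
    return naive

# IS position 150 in a 5,000,000 bp genome; flanking gene ends at 4,999,500.
print(wrapped_distance(150, 4999500, 5000000))  # 650 rather than ~4,999,350
```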
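Two of the smaller fixes in create_typing_out.py also come down to one-liners. Python's `range` excludes its stop value, so without the added `+1` a hit whose coordinate sits exactly on the end of the other hit's interval was never treated as falling inside it; and an unpaired hit that ends up paired with itself has identical left and right coordinates, which the new `elif` now catches. A small illustration with invented coordinates:

```python
# Invented coordinates; variable names follow the diff above.
x_R, y_R = 100, 250
R_range_old = range(min(x_R, y_R), max(x_R, y_R))      # excludes 250
R_range_new = range(min(x_R, y_R), max(x_R, y_R) + 1)  # includes 250
print(250 in R_range_old, 250 in R_range_new)  # False True

# A hit "paired" with itself: left and right coordinates are identical.
x_L, y_L = 1200, 1350
x_R, y_R = 1200, 1350
if x_L == x_R and y_L == y_R:
    print('hit paired with itself -> moved to removed_results')
```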
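Finally, the reworked branch in `add_known` now gates borderline matches instead of reporting everything: a hit is only called "Possible related IS" when its comparison against the IS query clears both 50% thresholds, and anything else is written to the removed-results file. A rough sketch of that decision, assuming (this ordering is not spelt out in the diff) that `seq_results[0]` holds percent identity and `seq_results[1]` percent coverage:

```python
# Hedged sketch only; classify_hit is not an ISMapper function.
def classify_hit(seq_results, from_unpaired):
    if len(seq_results) != 0 and seq_results[0] >= 50 and seq_results[1] >= 50:
        return 'Possible related IS?' if from_unpaired else 'Possible related IS'
    return None  # caller writes the line to removed_results instead

print(classify_hit([82.4, 96.0], False))  # 'Possible related IS'
print(classify_hit([34.1, 96.0], False))  # None -> discarded as spurious
```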