diff --git a/piranha/analysis/preprocessing.py b/piranha/analysis/preprocessing.py
index 4fe94f0..29e6430 100644
--- a/piranha/analysis/preprocessing.py
+++ b/piranha/analysis/preprocessing.py
@@ -92,27 +92,35 @@ def parse_line(line):
     return values
 
 def add_to_hit_dict(hits, mapping,min_map_len,min_map_quality,unmapped):
+    status,description = "",""
     if mapping["direction"] == "+":
         start = mapping["read_hit_start"]
         end = mapping["read_hit_end"]
-        if int(mapping["aln_block_len"]) > min_map_len and int(mapping["map_quality"]) > min_map_quality:
-            hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
-        else:
-            unmapped+=1
     elif mapping["direction"] == "-":
         start = mapping["read_hit_end"]
         end = mapping["read_hit_start"]
-        if int(mapping["aln_block_len"]) > min_map_len and int(mapping["map_quality"]) > min_map_quality:
-            hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
-        else:
-            unmapped+=1
     else:
         unmapped+=1
-    return unmapped
+        status = "unmapped"
+    if not status:
+        if int(mapping["aln_block_len"]) > min_map_len:
+            if int(mapping["map_quality"]) > min_map_quality:
+                hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
+                status = "mapped"
+            else:
+                unmapped+=1
+                status = "filtered"
+                description = "mapping quality too low"
+        else:
+            unmapped+=1
+            status = "filtered"
+            description = "alignment block too short"
+    return unmapped,status,description
 
-def group_hits(paf_file,ref_name_map,len_filter,min_map_quality):
+
+def group_hits(paf_file,ref_name_map,min_aln_block,min_map_quality,mapping_filter_file):
     total_reads= 0
     ambiguous =0
     unmapped = 0
@@ -121,48 +129,46 @@ def group_hits(paf_file,ref_name_map,len_filter,min_map_quality):
     mappings = []
     last_mapping = None
 
-    with open(paf_file, "r") as f:
-        for l in f:
-
-            mapping = parse_line(l)
-            if last_mapping:
-                if mapping["read_name"] == last_mapping["read_name"]:
-                    mappings.append(last_mapping)
-                else:
-                    mappings.append(last_mapping)
-
-                    if len(mappings) > 1:
-                        ambiguous +=1
-                        total_reads +=1
-
-                    else:
-                        unmapped = add_to_hit_dict(hits, last_mapping,len_filter,min_map_quality,unmapped)
-                        total_reads +=1
-
-                    mappings = []
-                last_mapping = mapping
-
-            else:
-                last_mapping = mapping
-
-    unmapped = add_to_hit_dict(hits, last_mapping,len_filter,min_map_quality,unmapped)
-    total_reads +=1
-
-    ref_hits = collections.defaultdict(list)
-    not_mapped = 0
-    for ref in hits:
-        hit_info = hits[ref]
-        for read in hit_info:
-
-            if read[-1] > 0.6*len_filter:
-                #aln block len needs to be more than 60% of min read len
-                ref_hits[ref].append(read)
-            else:
-                unmapped +=1
+    with open(mapping_filter_file,"w") as fw:
+        writer = csv.DictWriter(fw, fieldnames=["read_name","status","description"],lineterminator='\n')
+        writer.writeheader()
+        with open(paf_file, "r") as f:
+            for l in f:
+                filtered_reason = ""
+                mapping = parse_line(l)
+                total_reads +=1
+
+                if last_mapping:
+                    if mapping["read_name"] == last_mapping["read_name"]:
+                        mappings.append(last_mapping)
+                    else:
+                        mappings.append(last_mapping)
+
+                        if len(mappings) > 1:
+                            ambiguous +=1
+                            filtered_reason = "ambiguous mapping"
+                            row = {"read_name":mapping["read_name"],"status":"unmapped","description":"ambiguous_mapping"}
+                            writer.writerow(row)
+                        else:
+                            unmapped,status,description = add_to_hit_dict(hits, last_mapping,min_aln_block,min_map_quality,unmapped)
+                            row = {"read_name":mapping["read_name"],"status":status,"description":description}
+                            writer.writerow(row)
+
+                        mappings = []
+                    last_mapping = mapping
+
+                else:
+                    last_mapping = mapping
+
+        unmapped,status,description = add_to_hit_dict(hits, last_mapping,min_aln_block,min_map_quality,unmapped)
+        row = {"read_name":mapping["read_name"],"status":status,"description":description}
+        writer.writerow(row)
+        total_reads +=1
 
     if total_reads == 0:
         return {}, 0, 0, 0
-    return ref_hits, unmapped, ambiguous, total_reads
+
+    return hits, unmapped, ambiguous, total_reads
 
 
 def write_out_report(ref_index,ref_map,csv_out,hits,unmapped,total_reads,barcode):
@@ -214,6 +220,7 @@ def is_non_zero_file(fpath):
 
 def parse_paf_file(paf_file,
                     csv_out,
+                    mapping_filter_out,
                     hits_out,
                     references_sequences,
                     barcode,
@@ -226,9 +233,9 @@ def parse_paf_file(paf_file,
 
     permissive_ref_name_map = make_ref_display_name_map(references_sequences)
     ref_name_map = make_display_name_to_reference_group_map(permissive_ref_name_map)
-    len_filter = 0.4*config[KEY_MIN_READ_LENGTH]
+    min_aln_block = config[KEY_MIN_ALN_BLOCK]
 
-    ref_hits, unmapped,ambiguous, total_reads = group_hits(paf_file,ref_name_map,len_filter,min_map_quality)
+    ref_hits, unmapped, ambiguous, total_reads = group_hits(paf_file,ref_name_map,min_aln_block,min_map_quality,mapping_filter_out)
     print(f"Barcode: {barcode}")
     print(green("Unmapped:"),unmapped)
     print(green("Ambiguous mapping:"),ambiguous)
@@ -237,6 +244,7 @@ def parse_paf_file(paf_file,
         write_out_report(ref_index,ref_name_map,csv_out,ref_hits,unmapped,total_reads,barcode)
 
         write_out_hits(ref_hits,hits_out)
+
     else:
         print("No reads for",barcode)
         with open(csv_out,"w") as fw:
@@ -247,6 +255,11 @@ def parse_paf_file(paf_file,
             writer = csv.DictWriter(fw, lineterminator="\n",fieldnames=["read_name","hit","start","end","aln_block_len"])
             writer.writeheader()
 
+        with open(mapping_filter_out,"w") as fw:
+            writer = csv.DictWriter(fw, lineterminator="\n",fieldnames=["read_name","status","description"])
+            writer.writeheader()
+
+
 def diversity_report(input_files,csv_out,summary_out,ref_file,config):
 
     barcodes_csv= config[KEY_BARCODES_CSV]
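Note on the hunks above (annotation, not part of the patch; the CLI wiring follows below). The change makes the alignment-block-length cutoff configurable (`min_aln_block`) and has `group_hits` write a per-read `hits_filter_description.csv` recording whether each read ended up `mapped`, `filtered` (with a reason) or `unmapped`. It also drops the old post-mapping loop: that loop kept reads with `aln_block_len > 0.6*len_filter`, but `add_to_hit_dict` had already required `aln_block_len > len_filter` (with `len_filter = 0.4*min_read_length`), so that weaker check (effectively 0.24x the minimum read length) could never reject anything, despite its comment claiming a 60%-of-minimum-read-length cutoff; the new default of `0.6*min_read_length` implements what that comment actually described. The sketch below restates the keep/discard decision from the new `add_to_hit_dict` in isolation; `classify_mapping`, the toy thresholds and the toy records are hypothetical, written here for illustration only.

```python
import csv

def classify_mapping(mapping, min_aln_block, min_map_quality):
    # Hypothetical restatement of the filter in the new add_to_hit_dict:
    # a read is kept only if its strand is known, its alignment block is
    # long enough and its mapping quality is high enough (strict >, as in
    # the patch; block length is checked before quality, also as in the patch).
    if mapping["direction"] not in ("+", "-"):
        return "unmapped", ""
    if int(mapping["aln_block_len"]) <= min_aln_block:
        return "filtered", "alignment block too short"
    if int(mapping["map_quality"]) <= min_map_quality:
        return "filtered", "mapping quality too low"
    return "mapped", ""

# Emulate the new hits_filter_description.csv for two toy PAF records,
# with a toy threshold of 600 (0.6 * a 1000 bp minimum read length).
with open("hits_filter_description.csv", "w") as fw:
    writer = csv.DictWriter(fw, fieldnames=["read_name", "status", "description"], lineterminator="\n")
    writer.writeheader()
    for mapping in (
        {"read_name": "read_1", "direction": "+", "aln_block_len": "750", "map_quality": "60"},
        {"read_name": "read_2", "direction": "-", "aln_block_len": "200", "map_quality": "60"},
    ):
        status, description = classify_mapping(mapping, min_aln_block=600, min_map_quality=50)
        writer.writerow({"read_name": mapping["read_name"], "status": status, "description": description})
```

Run as-is, this writes `read_1` as `mapped` and `read_2` as `filtered` ("alignment block too short"), matching the rows `group_hits` would emit for equivalent mappings.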
diff --git a/piranha/command.py b/piranha/command.py
index f38161e..bd6af98 100644
--- a/piranha/command.py
+++ b/piranha/command.py
@@ -50,6 +50,7 @@ def main(sysargs = sys.argv[1:]):
     analysis_group.add_argument("-x","--max-read-length",action="store",type=int,help=f"Maximum read length. Default: {READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][1]}")
     analysis_group.add_argument("-d","--min-read-depth",action="store",type=int,help=f"Minimum read depth required for consensus generation. Default: {VALUE_MIN_READS}")
     analysis_group.add_argument("-p","--min-read-pcent",action="store",type=float,help=f"Minimum percentage of sample required for consensus generation. Default: {VALUE_MIN_PCENT}")
+    analysis_group.add_argument("-a","--min-aln-block",action="store",type=float,help=f"Minimum alignment block length. Default: 0.6*MIN_READ_LENGTH")
     analysis_group.add_argument("--primer-length",action="store",type=int,help=f"Length of primer sequences to trim off start and end of reads. Default: {VALUE_PRIMER_LENGTH}")
 
     phylo_group = parser.add_argument_group('Phylogenetics options')
@@ -122,6 +123,7 @@ def main(sysargs = sys.argv[1:]):
                                         args.max_read_length,
                                         args.min_read_depth,
                                         args.min_read_pcent,
+                                        args.min_aln_block,
                                         args.primer_length,
                                         args.min_map_quality,
                                         config)
diff --git a/piranha/input_parsing/analysis_arg_parsing.py b/piranha/input_parsing/analysis_arg_parsing.py
index db3482d..c899c11 100644
--- a/piranha/input_parsing/analysis_arg_parsing.py
+++ b/piranha/input_parsing/analysis_arg_parsing.py
@@ -50,17 +50,18 @@ def medaka_options_parsing(medaka_model,medaka_list_models,config):
         sys.stderr.write(cyan(f"Medaka model specified not valid: `{config[KEY_MEDAKA_MODEL]}`.\nPlease use --medaka-list-models to see which models are available.\nIf needed, update medaka version with `pip install --upgrade medaka`.\n"))
         sys.exit(-1)
 
-def analysis_group_parsing(min_read_length,max_read_length,min_read_depth,min_read_pcent,primer_length,min_map_quality,config):
+def analysis_group_parsing(min_read_length,max_read_length,min_read_depth,min_read_pcent,min_aln_block,primer_length,min_map_quality,config):
 
     # if command line arg, overwrite config value
     misc.add_arg_to_config(KEY_MIN_READ_LENGTH,min_read_length,config)
     misc.add_arg_to_config(KEY_MAX_READ_LENGTH,max_read_length,config)
     misc.add_arg_to_config(KEY_MIN_READS,min_read_depth,config)
     misc.add_arg_to_config(KEY_MIN_PCENT,min_read_pcent,config)
+    misc.add_arg_to_config(KEY_MIN_ALN_BLOCK,min_aln_block,config)
     misc.add_arg_to_config(KEY_PRIMER_LENGTH,primer_length,config)
     misc.add_arg_to_config(KEY_MIN_MAP_QUALITY,min_map_quality,config)
 
-    for key in [KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_READS]:
+    for key in [KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_READS,KEY_MIN_ALN_BLOCK]:
         check_if_int(key,config)
 
     check_if_float(KEY_MIN_PCENT,config)
diff --git a/piranha/input_parsing/initialising.py b/piranha/input_parsing/initialising.py
index 5991203..5ea6980 100644
--- a/piranha/input_parsing/initialising.py
+++ b/piranha/input_parsing/initialising.py
@@ -40,6 +40,7 @@ def get_defaults():
                     KEY_MIN_READ_LENGTH:READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][0],
                     KEY_MAX_READ_LENGTH:READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][1],
                     KEY_MIN_MAP_QUALITY:VALUE_MIN_MAP_QUALITY,
+                    KEY_MIN_ALN_BLOCK:0.6*(READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][0]),
                     KEY_MEDAKA_MODEL:VALUE_DEFAULT_MEDAKA_MODEL,
 
                     KEY_PRIMER_LENGTH:VALUE_PRIMER_LENGTH,
diff --git a/piranha/scripts/piranha_preprocessing.smk b/piranha/scripts/piranha_preprocessing.smk
index 1f9632c..d716861 100644
--- a/piranha/scripts/piranha_preprocessing.smk
+++ b/piranha/scripts/piranha_preprocessing.smk
@@ -51,10 +51,12 @@ rule assess_broad_diversity:
         min_map_quality = config[KEY_MIN_MAP_QUALITY]
     output:
         csv = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","refs_present.csv"),
-        hits = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_reads.csv")
+        hits = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_reads.csv"),
+        parsing = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_filter_description.csv")
     run:
         parse_paf_file(input.map_file,
                         output.csv,
+                        output.parsing,
                         output.hits,
                         config[KEY_REFERENCE_SEQUENCES],
                         params.barcode,
diff --git a/piranha/utils/config.py b/piranha/utils/config.py
index f67cc7c..bfb9d51 100644
--- a/piranha/utils/config.py
+++ b/piranha/utils/config.py
@@ -28,6 +28,7 @@
 KEY_MIN_READS = "min_read_depth"
 KEY_MIN_PCENT = "min_read_pcent"
 KEY_MIN_MAP_QUALITY = "min_map_quality"
+KEY_MIN_ALN_BLOCK = "min_aln_block"
 KEY_REFERENCE_SEQUENCES = "reference_sequences"
 KEY_MEDAKA_MODEL = "medaka_model"
 KEY_PRIMER_LENGTH = "primer_length"
@@ -143,7 +144,6 @@
 # READ_LENGTH_DEFAULT_WG = [3400,5200]
 VALUE_PRIMER_LENGTH = 30
 VALUE_MIN_MAP_QUALITY = 50
-
 VALUE_DEFAULT_MEDAKA_MODEL="r941_min_hac_variant_g507"
 VALUE_MIN_READS = 50
 
@@ -200,7 +200,7 @@
 "NonPolioEV|closest_reference","NonPolioEV|num_reads","NonPolioEV|nt_diff_from_reference","NonPolioEV|pcent_match","NonPolioEV|classification","comments"]
 
 VALUE_CONFIGURATION_TABLE_FIELDS = [
-    KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_MAP_QUALITY,KEY_MIN_READS,
+    KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_MAP_QUALITY,KEY_MIN_READS,KEY_MIN_ALN_BLOCK,
     KEY_MIN_PCENT,KEY_MEDAKA_MODEL,KEY_PRIMER_LENGTH,KEY_ANALYSIS_MODE,KEY_RUN_PHYLO
 ]
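The remaining hunks wire the threshold through the CLI (`-a/--min-aln-block`), the config defaults and the Snakemake rule. A minimal sketch of the precedence involved, assuming `misc.add_arg_to_config` only overwrites the config entry when a command-line value was actually supplied; the helper below is a stand-in written for illustration, not piranha's implementation, and the values are toy defaults:

```python
# Stand-in for misc.add_arg_to_config, assuming CLI-over-config precedence:
# a supplied command-line value replaces the config default, otherwise the
# default (0.6 * minimum read length, per initialising.py above) is kept.
def add_arg_to_config(key, arg, config):
    if arg is not None:
        config[key] = arg

config = {"min_read_length": 1000, "min_aln_block": 0.6 * 1000}  # toy defaults
add_arg_to_config("min_aln_block", 450, config)      # e.g. `piranha ... -a 450`
add_arg_to_config("min_map_quality", None, config)   # flag not given: no change
assert config["min_aln_block"] == 450
```

One thing a reviewer might note: the new flag is parsed with `type=float` and its default (`0.6*MIN_READ_LENGTH`) is a float, yet `KEY_MIN_ALN_BLOCK` is also added to the `check_if_int` loop in `analysis_arg_parsing.py`, so the two hunks appear to disagree about the intended type.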