Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paf parsing #167

Merged
merged 8 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 60 additions & 47 deletions piranha/analysis/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,27 +92,35 @@ def parse_line(line):
return values

def add_to_hit_dict(hits, mapping,min_map_len,min_map_quality,unmapped):
status,description = "",""
if mapping["direction"] == "+":
start = mapping["read_hit_start"]
end = mapping["read_hit_end"]
if int(mapping["aln_block_len"]) > min_map_len and int(mapping["map_quality"]) > min_map_quality:
hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
else:
unmapped+=1
elif mapping["direction"] == "-":
start = mapping["read_hit_end"]
end = mapping["read_hit_start"]
if int(mapping["aln_block_len"]) > min_map_len and int(mapping["map_quality"]) > min_map_quality:
hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
else:
unmapped+=1
else:
unmapped+=1
return unmapped
status = "unmapped"

if not status:
if int(mapping["aln_block_len"]) > min_map_len:
if int(mapping["map_quality"]) > min_map_quality:
hits[mapping["ref_hit"]].add((mapping["read_name"],start,end,mapping["aln_block_len"]))
status = "mapped"
else:
unmapped+=1
status = "filtered"
description = "mapping quality too low"
else:
unmapped+=1
status = "filtered"
description = "alignment block too short"

return unmapped,status,description

def group_hits(paf_file,ref_name_map,len_filter,min_map_quality):

def group_hits(paf_file,ref_name_map,min_aln_block,min_map_quality,mapping_filter_file):
total_reads= 0
ambiguous =0
unmapped = 0
Expand All @@ -121,48 +129,46 @@ def group_hits(paf_file,ref_name_map,len_filter,min_map_quality):
mappings = []

last_mapping = None
with open(paf_file, "r") as f:
for l in f:

mapping = parse_line(l)
if last_mapping:
if mapping["read_name"] == last_mapping["read_name"]:
mappings.append(last_mapping)
else:
mappings.append(last_mapping)

if len(mappings) > 1:
ambiguous +=1
total_reads +=1

with open(mapping_filter_file,"w") as fw:
writer = csv.DictWriter(fw, fieldnames=["read_name","status","description"],lineterminator='\n')
writer.writeheader()
with open(paf_file, "r") as f:
for l in f:
filtered_reason = ""
mapping = parse_line(l)
total_reads +=1

if last_mapping:
if mapping["read_name"] == last_mapping["read_name"]:
mappings.append(last_mapping)
else:
mappings.append(last_mapping)

unmapped = add_to_hit_dict(hits, last_mapping,len_filter,min_map_quality,unmapped)
total_reads +=1
if len(mappings) > 1:
ambiguous +=1
filtered_reason = "ambiguous mapping"
row = {"read_name":mapping["read_name"],"status":"unmapped","description":"ambiguous_mapping"}
writer.writerow(row)
else:
unmapped,status,description = add_to_hit_dict(hits, last_mapping,min_aln_block,min_map_quality,unmapped)
row = {"read_name":mapping["read_name"],"status":status,"description":description}
writer.writerow(row)

mappings = []
last_mapping = mapping
mappings = []
last_mapping = mapping

else:
last_mapping = mapping

unmapped = add_to_hit_dict(hits, last_mapping,len_filter,min_map_quality,unmapped)
total_reads +=1
ref_hits = collections.defaultdict(list)
not_mapped = 0
for ref in hits:
hit_info = hits[ref]
for read in hit_info:

if read[-1] > 0.6*len_filter:
#aln block len needs to be more than 60% of min read len
ref_hits[ref].append(read)
else:
unmapped +=1
else:
last_mapping = mapping

unmapped,status,description = add_to_hit_dict(hits, last_mapping,min_aln_block,min_map_quality,unmapped)
row = {"read_name":mapping["read_name"],"status":status,"description":description}
writer.writerow(row)
total_reads +=1

if total_reads == 0:
return {}, 0, 0, 0
return ref_hits, unmapped, ambiguous, total_reads

return hits, unmapped, ambiguous, total_reads

def write_out_report(ref_index,ref_map,csv_out,hits,unmapped,total_reads,barcode):

Expand Down Expand Up @@ -214,6 +220,7 @@ def is_non_zero_file(fpath):

def parse_paf_file(paf_file,
csv_out,
mapping_filter_out,
hits_out,
references_sequences,
barcode,
Expand All @@ -226,9 +233,9 @@ def parse_paf_file(paf_file,
permissive_ref_name_map = make_ref_display_name_map(references_sequences)
ref_name_map = make_display_name_to_reference_group_map(permissive_ref_name_map)

len_filter = 0.4*config[KEY_MIN_READ_LENGTH]
min_aln_block = config[KEY_MIN_ALN_BLOCK]

ref_hits, unmapped,ambiguous, total_reads = group_hits(paf_file,ref_name_map,len_filter,min_map_quality)
ref_hits, unmapped, ambiguous, total_reads = group_hits(paf_file,ref_name_map,min_aln_block,min_map_quality,mapping_filter_out)
print(f"Barcode: {barcode}")
print(green("Unmapped:"),unmapped)
print(green("Ambiguous mapping:"),ambiguous)
Expand All @@ -237,6 +244,7 @@ def parse_paf_file(paf_file,
write_out_report(ref_index,ref_name_map,csv_out,ref_hits,unmapped,total_reads,barcode)

write_out_hits(ref_hits,hits_out)

else:
print("No reads for",barcode)
with open(csv_out,"w") as fw:
Expand All @@ -247,6 +255,11 @@ def parse_paf_file(paf_file,
writer = csv.DictWriter(fw, lineterminator="\n",fieldnames=["read_name","hit","start","end","aln_block_len"])
writer.writeheader()

with open(mapping_filter_out,"w") as fw:
writer = csv.DictWriter(fw, lineterminator="\n",fieldnames=["read_name","status","description"])
writer.writeheader()



def diversity_report(input_files,csv_out,summary_out,ref_file,config):
barcodes_csv= config[KEY_BARCODES_CSV]
Expand Down
2 changes: 2 additions & 0 deletions piranha/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def main(sysargs = sys.argv[1:]):
analysis_group.add_argument("-x","--max-read-length",action="store",type=int,help=f"Maximum read length. Default: {READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][1]}")
analysis_group.add_argument("-d","--min-read-depth",action="store",type=int,help=f"Minimum read depth required for consensus generation. Default: {VALUE_MIN_READS}")
analysis_group.add_argument("-p","--min-read-pcent",action="store",type=float,help=f"Minimum percentage of sample required for consensus generation. Default: {VALUE_MIN_PCENT}")
analysis_group.add_argument("-a","--min-aln-block",action="store",type=float,help=f"Minimum alignment block length. Default: 0.6*MIN_READ_LENGTH")
analysis_group.add_argument("--primer-length",action="store",type=int,help=f"Length of primer sequences to trim off start and end of reads. Default: {VALUE_PRIMER_LENGTH}")

phylo_group = parser.add_argument_group('Phylogenetics options')
Expand Down Expand Up @@ -122,6 +123,7 @@ def main(sysargs = sys.argv[1:]):
args.max_read_length,
args.min_read_depth,
args.min_read_pcent,
args.min_aln_block,
args.primer_length,
args.min_map_quality,
config)
Expand Down
5 changes: 3 additions & 2 deletions piranha/input_parsing/analysis_arg_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,18 @@ def medaka_options_parsing(medaka_model,medaka_list_models,config):
sys.stderr.write(cyan(f"Medaka model specified not valid: `{config[KEY_MEDAKA_MODEL]}`.\nPlease use --medaka-list-models to see which models are available.\nIf needed, update medaka version with `pip install --upgrade medaka`.\n"))
sys.exit(-1)

def analysis_group_parsing(min_read_length,max_read_length,min_read_depth,min_read_pcent,primer_length,min_map_quality,config):
def analysis_group_parsing(min_read_length,max_read_length,min_read_depth,min_read_pcent,min_aln_block,primer_length,min_map_quality,config):

# if command line arg, overwrite config value
misc.add_arg_to_config(KEY_MIN_READ_LENGTH,min_read_length,config)
misc.add_arg_to_config(KEY_MAX_READ_LENGTH,max_read_length,config)
misc.add_arg_to_config(KEY_MIN_READS,min_read_depth,config)
misc.add_arg_to_config(KEY_MIN_PCENT,min_read_pcent,config)
misc.add_arg_to_config(KEY_MIN_ALN_BLOCK,min_aln_block,config)
misc.add_arg_to_config(KEY_PRIMER_LENGTH,primer_length,config)
misc.add_arg_to_config(KEY_MIN_MAP_QUALITY,min_map_quality,config)

for key in [KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_READS]:
for key in [KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_READS,KEY_MIN_ALN_BLOCK]:
check_if_int(key,config)

check_if_float(KEY_MIN_PCENT,config)
Expand Down
1 change: 1 addition & 0 deletions piranha/input_parsing/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def get_defaults():
KEY_MIN_READ_LENGTH:READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][0],
KEY_MAX_READ_LENGTH:READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][1],
KEY_MIN_MAP_QUALITY:VALUE_MIN_MAP_QUALITY,
KEY_MIN_ALN_BLOCK:0.6*(READ_LENGTH_DICT[VALUE_ANALYSIS_MODE][0]),

KEY_MEDAKA_MODEL:VALUE_DEFAULT_MEDAKA_MODEL,
KEY_PRIMER_LENGTH:VALUE_PRIMER_LENGTH,
Expand Down
4 changes: 3 additions & 1 deletion piranha/scripts/piranha_preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ rule assess_broad_diversity:
min_map_quality = config[KEY_MIN_MAP_QUALITY]
output:
csv = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","refs_present.csv"),
hits = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_reads.csv")
hits = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_reads.csv"),
parsing = os.path.join(config[KEY_TEMPDIR],"{barcode}","initial_processing","hits_filter_description.csv")
run:
parse_paf_file(input.map_file,
output.csv,
output.parsing,
output.hits,
config[KEY_REFERENCE_SEQUENCES],
params.barcode,
Expand Down
4 changes: 2 additions & 2 deletions piranha/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
KEY_MIN_READS = "min_read_depth"
KEY_MIN_PCENT = "min_read_pcent"
KEY_MIN_MAP_QUALITY = "min_map_quality"
KEY_MIN_ALN_BLOCK = "min_aln_block"
KEY_REFERENCE_SEQUENCES = "reference_sequences"
KEY_MEDAKA_MODEL = "medaka_model"
KEY_PRIMER_LENGTH = "primer_length"
Expand Down Expand Up @@ -143,7 +144,6 @@
# READ_LENGTH_DEFAULT_WG = [3400,5200]
VALUE_PRIMER_LENGTH = 30
VALUE_MIN_MAP_QUALITY = 50

VALUE_DEFAULT_MEDAKA_MODEL="r941_min_hac_variant_g507"

VALUE_MIN_READS = 50
Expand Down Expand Up @@ -200,7 +200,7 @@
"NonPolioEV|closest_reference","NonPolioEV|num_reads","NonPolioEV|nt_diff_from_reference","NonPolioEV|pcent_match","NonPolioEV|classification","comments"]

VALUE_CONFIGURATION_TABLE_FIELDS = [
KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_MAP_QUALITY,KEY_MIN_READS,
KEY_MIN_READ_LENGTH,KEY_MAX_READ_LENGTH,KEY_MIN_MAP_QUALITY,KEY_MIN_READS,KEY_MIN_ALN_BLOCK,
KEY_MIN_PCENT,KEY_MEDAKA_MODEL,KEY_PRIMER_LENGTH,KEY_ANALYSIS_MODE,KEY_RUN_PHYLO
]

Expand Down
Loading