diff --git a/piranha/analysis/haplo_functions.py b/piranha/analysis/haplo_functions.py
index c27b05e..90bc535 100644
--- a/piranha/analysis/haplo_functions.py
+++ b/piranha/analysis/haplo_functions.py
@@ -11,7 +11,6 @@ def parse_partition_file(partition_file):
-
     partitions = collections.defaultdict(set)
     with open(partition_file, "r") as f:
         part = ""
@@ -35,6 +34,11 @@ def collapse_close(flopp_file,distance_cutoff,vcf_file):
     final_haplos = []
     vcf = parse_VCF(vcf_file)
    parsed_flopp = parse_flopp_reads(flopp_file)
+    # if flopp wasn't run the file is empty, so parsing gives an empty list;
+    # fall back to a single partition containing all reads
+    if parsed_flopp == []:
+        return [[0]]
+
     for h in parsed_flopp:
         collapse_into_current = [haplo_pos]
         if haplo_pos in collapsed:
@@ -176,7 +180,12 @@ def parse_support_col(col):
     total_reads = 0
     base_support = {}
     for base in base_split:
-        base_val,support_reads = base.split(':')
+        try:
+            base_val,support_reads = base.split(':')
+        except ValueError:
+            # a bare -1 in the support col means zero reads in this partition overlap the position
+            base_val = '-1'
+            support_reads = 0
         # if base_val == selbase_val:
         #     base_support = int(support_reads)
         base_support[base_val] = int(support_reads)
@@ -187,6 +196,9 @@ def check_site_sig(base_support,allele_freqs):
     #check if allele freqs are consistent with random sample from overall reads for this site
     #need to know these are the same order
     #reads might have 100% one allele, need to insert zero in this case, then sort to make sure order correct
+    if '-1' in base_support.keys():
+        # zero read coverage, so this site can't be significant
+        return False
     for key in allele_freqs:
         if key not in base_support:
             base_support[key] = 0
@@ -226,7 +238,16 @@ def get_haplo_dist(h1,h2,parsed_VCF):
 def sum_base_support(dict1,dict2):
     out_dict = {}
     for key in dict1:
-        out_dict[key] = dict1[key] + dict2[key]
+        try:
+            out_dict[key] = dict1[key] + dict2[key]
+        except KeyError:
+            # base not recorded in dict2 - probably a -1 col
+            out_dict[key] = dict1[key]
+    # carry across any keys that only appear in dict2
+    for key in dict2:
+        if key not in dict1.keys():
+            out_dict[key] = dict2[key]
+
     return out_dict
 
 def calc_sd(haplotypes):
diff --git a/piranha/scripts/piranha_haplotype.smk b/piranha/scripts/piranha_haplotype.smk
index 3a9e79b..52890ce 100644
--- a/piranha/scripts/piranha_haplotype.smk
+++ b/piranha/scripts/piranha_haplotype.smk
@@ -85,8 +85,11 @@ rule flopp:
         partition = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","haplotyping","flopp_partitions","{reference}_part.txt")
     shell:
         """
-        VARIANTS=$(grep -v '#' {input.vcf} | wc -l)
-        if [[ $VARIANTS -gt 0 ]]
+        # || true stops snakemake catching the exit code 1 that grep returns on finding zero lines
+        VARIANTS=$( (grep -v '#' {input.vcf} | wc -l) || true)
+        echo "Number of variants"
+        echo $VARIANTS
+        if [[ $VARIANTS -gt 1 ]]
         then
            flopp -b {input.bam:q} \
                  -c {input.vcf:q} \
@@ -95,7 +98,7 @@ rule flopp:
                  -p {params.ploidy} \
                  -o {output.flopp:q} \
                  -P {params.partition_path}
         else
-            echo "No variants called - single reference haplotype"
+            echo "Fewer than 2 variants called - haplotyping not possible, a single haplotype will be output"
             touch {output.partition:q}
             touch {output.flopp:q}
         fi
@@ -129,10 +132,14 @@ rule haplotype_qc:
         with open(output.txt,"w") as fhaplo:
             merged_haplo_count = 0
             for subset in merge_info:
-                reads = set()
-                for part in subset:
-                    part_reads = partitions[part]
-                    reads = reads.union(part_reads)
+                if subset == [0] and len(merge_info) == 1:
+                    # flopp wasn't run, so a single haplotype is called using all reads
+                    reads = [read for read in seq_index]
+                else:
+                    reads = set()
+                    for part in subset:
+                        part_reads = partitions[part]
+                        reads = reads.union(part_reads)
                 if len(reads) > params.min_reads:
                     print(green(f"Haplotype HAP0{merged_haplo_count}:"), len(reads), "reads")
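
On the VARIANTS line, the space in $( (grep ... | wc -l) || true) is deliberate:
without it bash parses the leading $(( as an arithmetic expansion rather than a
command substitution wrapping a subshell.

A minimal sketch (not part of the patch) of how a "-1" support column flows
through the patched functions. The dicts below are hypothetical flopp support
columns, and the import assumes the piranha package is importable:

    from piranha.analysis.haplo_functions import sum_base_support

    # partition 1: 10 reads support A and 3 support C at this site
    support_p1 = {"A": 10, "C": 3}
    # partition 2: no reads overlap the site, so its column was a bare "-1",
    # which the patched parse_support_col records as {'-1': 0}
    support_p2 = {"-1": 0}

    # the try/except keeps A and C; the second loop carries "-1" across
    merged = sum_base_support(support_p1, support_p2)
    assert merged == {"A": 10, "C": 3, "-1": 0}

Because the merged dict still carries the "-1" key, the patched check_site_sig
rejects the site outright, so zero-coverage positions can never test as
significant.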
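
A second sketch, again illustrative only, of the fallback path when flopp is
skipped. The file names and the distance cutoff of 10 are made up; it assumes
empty_flopp.txt is the empty file touched by the rule and variants.vcf is a
VCF that parse_VCF can read:

    from piranha.analysis.haplo_functions import collapse_close

    # fewer than 2 variants: flopp wasn't run, parse_flopp_reads returns []
    # and collapse_close falls back to the sentinel single partition
    merge_info = collapse_close("empty_flopp.txt", 10, "variants.vcf")
    assert merge_info == [[0]]

haplotype_qc recognises this sentinel (subset == [0] and len(merge_info) == 1)
and pools every read in seq_index into the one haplotype instead of looking up
flopp partitions.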