Skip to content

Commit

Permalink
Revert "Update piranha_consensus.smk"
Browse files Browse the repository at this point in the history
This reverts commit 848f660.
  • Loading branch information
Desperate-Dan committed Apr 16, 2024
1 parent 848f660 commit ff2d8c0
Showing 1 changed file with 136 additions and 183 deletions.
319 changes: 136 additions & 183 deletions piranha/scripts/piranha_consensus.smk
Original file line number Diff line number Diff line change
@@ -1,223 +1,176 @@
#!/usr/bin/env python3


import os
import collections
from Bio import SeqIO
import yaml

from piranha.analysis.clean_gaps import *
from piranha.analysis.consensus_functions import *
from piranha.utils.log_colours import green,cyan
from piranha.utils.log_colours import green,cyan,yellow
from piranha.utils.config import *



BARCODE = config[KEY_BARCODE]
SAMPLE = str(config[KEY_SAMPLE])
REFERENCES = config[BARCODE]

rule all:
input:
os.path.join(config[KEY_TEMPDIR],"consensus_sequences.fasta"),
os.path.join(config[KEY_TEMPDIR],"variants.csv"),
os.path.join(config[KEY_TEMPDIR],"masked_variants.csv"),
expand(os.path.join(config[KEY_TEMPDIR],"snipit","{reference}.svg"), reference=REFERENCES),
expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.mask.tsv"), reference=REFERENCES)


# do this per cns
os.path.join(config[KEY_TEMPDIR],"consensus_config.yaml")

rule files:
params:
ref= os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.ref.fasta"),
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.fasta"),
vcf = os.path.join(config[KEY_TEMPDIR],"variant_calls","{reference}.vcf")
ref=os.path.join(config[KEY_TEMPDIR],"reference_groups","{reference}.reference.fasta"),
reads=os.path.join(config[KEY_TEMPDIR],"reference_groups","{reference}.fastq")

rule maskara:
rule medaka_haploid_variant:
input:
ref= rules.files.params.ref,
cns = rules.files.params.cns,
vcf = rules.files.params.vcf, #Need to get the vcf files for bcftools consensus to work
bam = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.bam"),
reads=rules.files.params.reads,
ref=rules.files.params.ref
params:
reference = "{reference}"
model = config[KEY_MEDAKA_MODEL],
outdir = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant")
output:
mask = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.mask.tsv"),
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.masked.fasta")
run:
mask_file = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{wildcards.reference}.merged_cns.mask") #Added this line to define output variable in maskara command (wouldn't be a problem is maska didn't add a .tsv; must fix)
ref = params.reference.split(".SEQ")[0]
outfile = output.mask[:-4]
shell(f"maskara '{input.bam}' -r '{ref}' -o '{mask_file}'")
shell(f"echo '{ref} {ref}' | bcftools annotate --rename-chrs /dev/stdin '{input.vcf}' > {input.vcf}_renamed.vcf") #This line is not used currently, but if there's a change to the input to bcftools may be needed. Changes the name of the chrm variants are mapped against to allow bcftools to appy variants to whichever input is needed if your fasta is titled differently.
shell(f"bgzip -c '{input.vcf}_renamed.vcf' > '{input.vcf}_renamed.vcf.gz'") #bcftools needs vcf files to be bgzipped...
shell(f"bcftools index '{input.vcf}_renamed.vcf.gz'") #...and also indexed
shell(f"bcftools consensus -f '{input.ref}' -m '{mask_file}.tsv' '{input.vcf}_renamed.vcf.gz' > '{output.fasta}'") #Applies the mask and variants to the ORIGINAL REFERENCE. I think this is sensible, bcftools fails if you give it merged_cns for example as the "ref" alleles in the vcf don't match the provided "ref" sequence.

# cns = ""
# for record in SeqIO.parse(input.cns, "fasta"):
# cns = str(record.seq)
# new_seq = ""
# with open(output.mask, "r") as f:
# for l in f:
# l= l.rstrip()
# ref,start,end = l.split("\t")
# start = int(start)
# end = int(end)
# if not new_seq:
# new_seq = cns[:start]

# new_seq += cns[:]


rule join_cns_ref:
input:
ref=rules.files.params.ref,
cns = rules.maskara.output.fasta
params:
reference = "{reference}"
output:
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_cns.fasta")
run:
with open(output[0],"w") as fw:
for record in SeqIO.parse(input.ref,KEY_FASTA):
match_field = ""
for field in record.description.split(" "):
if field.startswith(VALUE_REFERENCE_GROUP_FIELD):
match_field = field.split("=")[1]

fw.write(f">{match_field} {record.description}\n{record.seq}\n")
if "Sabin" in params.reference:
for record in SeqIO.parse(input.cns,KEY_FASTA):
cns = record.id.split(".")[-1]
record_name = str(SAMPLE).replace(" ","_")
record_name += f"|{cns}"
fw.write(f">{record_name}\n{record.seq}\n")
else:
# need to check up on this
for record in SeqIO.parse(input.cns,KEY_FASTA):
record_name = str(SAMPLE).replace(" ","_")
fw.write(f">{record_name}\n{record.seq}\n")


rule align_cns_ref:
input:
rules.join_cns_ref.output.fasta
output:
aln = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_cns.aln.fasta")
probs = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus_probs.hdf"),
vcf = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","medaka.vcf"),
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus.fasta"),
pub_vcf = os.path.join(config[KEY_TEMPDIR],"variant_calls","{reference}.vcf")
log: os.path.join(config[KEY_TEMPDIR],"logs","{reference}.hapoid_variant.log")
shell:
"""
mafft {input:q} > {output:q}
[ ! -d {params.outdir:q} ] && mkdir {params.outdir:q}
if [ -s {input.ref:q} ]
then
medaka_haploid_variant -i {input.reads:q} \
-r {input.ref:q} \
-o {params.outdir:q} \
-m {params.model:q} \
-f -x && \
medaka stitch {output.probs:q} {input.ref:q} {output.cns:q}
else
touch {output.cns:q}
touch {output.probs:q}
touch {output.vcf:q}
fi
cp {output.vcf:q} {output.pub_vcf:q}
"""

# this step might lead to some sequences now being identical
rule curate_variants:

rule medaka_haploid_variant_cns:
input:
aln = rules.align_cns_ref.output.aln
reads=rules.files.params.reads,
ref=rules.medaka_haploid_variant.output.cns
params:
reference = "{reference}"
reference = "{reference}",
model = config[KEY_MEDAKA_MODEL],
outdir = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns")
output:
masked = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","masked.csv"),
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_cns_clean.fasta")
probs = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus_probs.hdf"),
vcf = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","medaka.vcf"),
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus.fasta")
log: os.path.join(config[KEY_TEMPDIR],"logs","{reference}_cns.hapoid_variant.log")
run:
masked = clean_medaka_cns(SAMPLE, input.aln, output.fasta)
with open(output.masked,"w") as fw:
for var in masked:
site = int(var) + 1
fw.write(f"{params.reference},{site},{masked[var]}\n")

rule gather_masked_variants:
if not "Sabin" in params.reference:
shell("""
[ ! -d {params.outdir:q} ] && mkdir {params.outdir:q}
if [ -s {input.ref:q} ]
then
medaka_haploid_variant -i {input.reads:q} \
-r {input.ref:q} \
-o {params.outdir:q} \
-m {params.model:q} \
-f -x && \
medaka stitch {output.probs} {input.ref} {output.cns}
else
touch {output.cns:q}
touch {output.probs:q}
touch {output.vcf:q}
fi
""")
else:
shell("""
touch {output.cns:q}
touch {output.probs:q}
touch {output.vcf:q}
""")

rule gather_merge_cns:
input:
expand(rules.curate_variants.output.masked, reference=REFERENCES)
ref=expand(rules.files.params.ref,reference=REFERENCES),
cns = expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus.fasta"),reference=REFERENCES),
cns_cns = expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus.fasta"),reference=REFERENCES)
output:
masked = os.path.join(config[KEY_TEMPDIR],"masked_variants.csv")
yaml = os.path.join(config[KEY_TEMPDIR],"consensus_config.yaml")
run:
with open(output.masked,"w") as fw:
fw.write("reference,site,variant\n")
for i in input:
with open(i, "r") as f:
for l in f:
l=l.rstrip("\n")
fw.write(l + '\n')
sequences = collections.defaultdict(set)
ref_seqs = {}

for cns_file in input.cns:
if "Sabin" in cns_file:
haplodir = "/".join(cns_file.split("/")[:-1])
else:
haplodir = "/".join(cns_file.split("/")[:-1])+"_cns"

rule join_clean_cns_ref:
input:
ref=rules.files.params.ref,
cns=rules.curate_variants.output.fasta
output:
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_medaka_cns_clean.fasta")
run:
with open(output[0],"w") as fw:
for record in SeqIO.parse(input.ref,KEY_FASTA):
match_field = ""
for field in record.description.split(" "):
if field.startswith(VALUE_REFERENCE_GROUP_FIELD):
match_field = field.split("=")[1]

fw.write(f">{match_field} {record.description}\n{record.seq}\n")
for record in SeqIO.parse(input.cns,KEY_FASTA):
record_name = SAMPLE.replace(" ","_")
fw.write(f">{record_name}\n{record.seq}\n")

rule align_clean_cns_ref:
input:
rules.join_clean_cns_ref.output.fasta
output:
aln = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_medaka_cns_clean.aln.fasta")
shell:
"""
mafft {input:q} > {output:q}
"""
haplo_bam = os.path.join(haplodir, "calls_to_ref.bam")
for record in SeqIO.parse(cns_file, KEY_FASTA):
print(record.id)
sequences[str(record.seq)].add(haplo_bam)
ref_seqs[str(record.seq)] = record.id

ref_file_dict = {}
for ref_file in input.ref:
for record in SeqIO.parse(ref_file,KEY_FASTA):
ref_file_dict[record.id] = ref_file
# print(ref_file_dict)
cns_file_dict = {}
for cns_file in input.cns_cns:
for record in SeqIO.parse(cns_file,KEY_FASTA):
cns_file_dict[record.id] = cns_file
# print(cns_file_dict)

for seq in sequences:
print(seq[:5],"...",sequences[seq])

ref_count = collections.Counter()
ref_cns = []
new_config = config

rule make_snipit_graph:
input:
aln = rules.align_clean_cns_ref.output.aln
params:
out_stem = os.path.join(config[KEY_TEMPDIR],"snipit","{reference}")
output:
os.path.join(config[KEY_TEMPDIR],"snipit","{reference}.svg")
run:
try:
shell("""snipit {input.aln:q} -o {params.out_stem} -f svg -c wes""")
except:
shell("touch {output[0]:q}")
for seq in sequences:
ref = ref_seqs[seq]
ref_count[ref] +=1
record_id = f"{ref_seqs[seq]}.{config[KEY_CNS_STEM]}0{ref_count[ref]}"
ref_cns.append(record_id)

haplo_bams = ""
for i in sequences[seq]:
haplo_bams += f"'{i}' "

out_bam = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.merged_cns.bam")
read_count_out = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.read_count.txt")

shell(f"samtools merge -f '{out_bam}' {haplo_bams} && samtools index '{out_bam}'")
shell(f"samtools view -F 0x904 -c '{out_bam}' > {read_count_out}")

with open(read_count_out,"r") as f:
for l in f:
read_count = l.rstrip()
new_config[record_id] = read_count

with open(os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.merged_cns.fasta"),"w") as fseq:
fseq.write(f">{record_id} read_count={read_count}\n{seq}\n")

cns_ref = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.ref.fasta")
cns_cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.cns_ref.fasta")
# print(cns_cns)
shell(f"cp '{ref_file_dict[ref]}' '{cns_ref}'")
if ref in cns_file_dict:
shell(f"cp '{cns_file_dict[ref]}' '{cns_cns}'")

new_config[BARCODE] = ref_cns

with open(output.yaml, 'w') as fw:
yaml.dump(new_config, fw)
print(yellow("-----------------------"))

rule assess_variants:
input:
rules.align_clean_cns_ref.output.aln
params:
reference = "{reference}"
output:
csv = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","variants.csv")
run:
parse_variants(input[0],output.csv,BARCODE,params.reference)

rule gather_variants:
input:
expand(rules.assess_variants.output.csv, reference=REFERENCES)
output:
csv = os.path.join(config[KEY_TEMPDIR],"variants.csv")
run:
join_variant_files(VARIANT_CALLS_HEADER_FIELDS,input,output.csv)

rule gather_cns:
input:
variants = rules.gather_variants.output.csv,
seqs = expand(rules.curate_variants.output.fasta, reference=REFERENCES)
output:
os.path.join(config[KEY_TEMPDIR],"consensus_sequences.fasta")
run:
var_dict = {}
with open(input.variants, "r") as f:
reader = csv.DictReader(f)
for row in reader:
var_dict[row[KEY_REFERENCE]] = row

with open(output[0],"w") as fw:
for in_file in input.seqs:
reference = os.path.basename(os.path.dirname(in_file))
for record in SeqIO.parse(in_file,KEY_FASTA):
variant_count = var_dict[reference]["variant_count"]
variant_string = var_dict[reference]["variants"]
fw.write(f">{reference}|{BARCODE}|{variant_count}|{variant_string}\n{record.seq}\n")

0 comments on commit ff2d8c0

Please sign in to comment.