-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Revert "Update piranha_consensus.smk"
This reverts commit 848f660.
- Loading branch information
1 parent
848f660
commit ff2d8c0
Showing
1 changed file
with
136 additions
and
183 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,223 +1,176 @@ | ||
#!/usr/bin/env python3 | ||
|
||
|
||
import os | ||
import collections | ||
from Bio import SeqIO | ||
import yaml | ||
|
||
from piranha.analysis.clean_gaps import * | ||
from piranha.analysis.consensus_functions import * | ||
from piranha.utils.log_colours import green,cyan | ||
from piranha.utils.log_colours import green,cyan,yellow | ||
from piranha.utils.config import * | ||
|
||
|
||
|
||
BARCODE = config[KEY_BARCODE] | ||
SAMPLE = str(config[KEY_SAMPLE]) | ||
REFERENCES = config[BARCODE] | ||
|
||
rule all: | ||
input: | ||
os.path.join(config[KEY_TEMPDIR],"consensus_sequences.fasta"), | ||
os.path.join(config[KEY_TEMPDIR],"variants.csv"), | ||
os.path.join(config[KEY_TEMPDIR],"masked_variants.csv"), | ||
expand(os.path.join(config[KEY_TEMPDIR],"snipit","{reference}.svg"), reference=REFERENCES), | ||
expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.mask.tsv"), reference=REFERENCES) | ||
|
||
|
||
# do this per cns | ||
os.path.join(config[KEY_TEMPDIR],"consensus_config.yaml") | ||
|
||
rule files: | ||
params: | ||
ref= os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.ref.fasta"), | ||
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.fasta"), | ||
vcf = os.path.join(config[KEY_TEMPDIR],"variant_calls","{reference}.vcf") | ||
ref=os.path.join(config[KEY_TEMPDIR],"reference_groups","{reference}.reference.fasta"), | ||
reads=os.path.join(config[KEY_TEMPDIR],"reference_groups","{reference}.fastq") | ||
|
||
rule maskara: | ||
rule medaka_haploid_variant: | ||
input: | ||
ref= rules.files.params.ref, | ||
cns = rules.files.params.cns, | ||
vcf = rules.files.params.vcf, #Need to get the vcf files for bcftools consensus to work | ||
bam = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.bam"), | ||
reads=rules.files.params.reads, | ||
ref=rules.files.params.ref | ||
params: | ||
reference = "{reference}" | ||
model = config[KEY_MEDAKA_MODEL], | ||
outdir = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant") | ||
output: | ||
mask = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.mask.tsv"), | ||
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}.merged_cns.masked.fasta") | ||
run: | ||
mask_file = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{wildcards.reference}.merged_cns.mask") #Added this line to define output variable in maskara command (wouldn't be a problem is maska didn't add a .tsv; must fix) | ||
ref = params.reference.split(".SEQ")[0] | ||
outfile = output.mask[:-4] | ||
shell(f"maskara '{input.bam}' -r '{ref}' -o '{mask_file}'") | ||
shell(f"echo '{ref} {ref}' | bcftools annotate --rename-chrs /dev/stdin '{input.vcf}' > {input.vcf}_renamed.vcf") #This line is not used currently, but if there's a change to the input to bcftools may be needed. Changes the name of the chrm variants are mapped against to allow bcftools to appy variants to whichever input is needed if your fasta is titled differently. | ||
shell(f"bgzip -c '{input.vcf}_renamed.vcf' > '{input.vcf}_renamed.vcf.gz'") #bcftools needs vcf files to be bgzipped... | ||
shell(f"bcftools index '{input.vcf}_renamed.vcf.gz'") #...and also indexed | ||
shell(f"bcftools consensus -f '{input.ref}' -m '{mask_file}.tsv' '{input.vcf}_renamed.vcf.gz' > '{output.fasta}'") #Applies the mask and variants to the ORIGINAL REFERENCE. I think this is sensible, bcftools fails if you give it merged_cns for example as the "ref" alleles in the vcf don't match the provided "ref" sequence. | ||
|
||
# cns = "" | ||
# for record in SeqIO.parse(input.cns, "fasta"): | ||
# cns = str(record.seq) | ||
# new_seq = "" | ||
# with open(output.mask, "r") as f: | ||
# for l in f: | ||
# l= l.rstrip() | ||
# ref,start,end = l.split("\t") | ||
# start = int(start) | ||
# end = int(end) | ||
# if not new_seq: | ||
# new_seq = cns[:start] | ||
|
||
# new_seq += cns[:] | ||
|
||
|
||
rule join_cns_ref: | ||
input: | ||
ref=rules.files.params.ref, | ||
cns = rules.maskara.output.fasta | ||
params: | ||
reference = "{reference}" | ||
output: | ||
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_cns.fasta") | ||
run: | ||
with open(output[0],"w") as fw: | ||
for record in SeqIO.parse(input.ref,KEY_FASTA): | ||
match_field = "" | ||
for field in record.description.split(" "): | ||
if field.startswith(VALUE_REFERENCE_GROUP_FIELD): | ||
match_field = field.split("=")[1] | ||
|
||
fw.write(f">{match_field} {record.description}\n{record.seq}\n") | ||
if "Sabin" in params.reference: | ||
for record in SeqIO.parse(input.cns,KEY_FASTA): | ||
cns = record.id.split(".")[-1] | ||
record_name = str(SAMPLE).replace(" ","_") | ||
record_name += f"|{cns}" | ||
fw.write(f">{record_name}\n{record.seq}\n") | ||
else: | ||
# need to check up on this | ||
for record in SeqIO.parse(input.cns,KEY_FASTA): | ||
record_name = str(SAMPLE).replace(" ","_") | ||
fw.write(f">{record_name}\n{record.seq}\n") | ||
|
||
|
||
rule align_cns_ref: | ||
input: | ||
rules.join_cns_ref.output.fasta | ||
output: | ||
aln = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_cns.aln.fasta") | ||
probs = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus_probs.hdf"), | ||
vcf = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","medaka.vcf"), | ||
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus.fasta"), | ||
pub_vcf = os.path.join(config[KEY_TEMPDIR],"variant_calls","{reference}.vcf") | ||
log: os.path.join(config[KEY_TEMPDIR],"logs","{reference}.hapoid_variant.log") | ||
shell: | ||
""" | ||
mafft {input:q} > {output:q} | ||
[ ! -d {params.outdir:q} ] && mkdir {params.outdir:q} | ||
if [ -s {input.ref:q} ] | ||
then | ||
medaka_haploid_variant -i {input.reads:q} \ | ||
-r {input.ref:q} \ | ||
-o {params.outdir:q} \ | ||
-m {params.model:q} \ | ||
-f -x && \ | ||
medaka stitch {output.probs:q} {input.ref:q} {output.cns:q} | ||
else | ||
touch {output.cns:q} | ||
touch {output.probs:q} | ||
touch {output.vcf:q} | ||
fi | ||
cp {output.vcf:q} {output.pub_vcf:q} | ||
""" | ||
|
||
# this step might lead to some sequences now being identical | ||
rule curate_variants: | ||
|
||
rule medaka_haploid_variant_cns: | ||
input: | ||
aln = rules.align_cns_ref.output.aln | ||
reads=rules.files.params.reads, | ||
ref=rules.medaka_haploid_variant.output.cns | ||
params: | ||
reference = "{reference}" | ||
reference = "{reference}", | ||
model = config[KEY_MEDAKA_MODEL], | ||
outdir = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns") | ||
output: | ||
masked = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","masked.csv"), | ||
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_cns_clean.fasta") | ||
probs = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus_probs.hdf"), | ||
vcf = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","medaka.vcf"), | ||
cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus.fasta") | ||
log: os.path.join(config[KEY_TEMPDIR],"logs","{reference}_cns.hapoid_variant.log") | ||
run: | ||
masked = clean_medaka_cns(SAMPLE, input.aln, output.fasta) | ||
with open(output.masked,"w") as fw: | ||
for var in masked: | ||
site = int(var) + 1 | ||
fw.write(f"{params.reference},{site},{masked[var]}\n") | ||
|
||
rule gather_masked_variants: | ||
if not "Sabin" in params.reference: | ||
shell(""" | ||
[ ! -d {params.outdir:q} ] && mkdir {params.outdir:q} | ||
if [ -s {input.ref:q} ] | ||
then | ||
medaka_haploid_variant -i {input.reads:q} \ | ||
-r {input.ref:q} \ | ||
-o {params.outdir:q} \ | ||
-m {params.model:q} \ | ||
-f -x && \ | ||
medaka stitch {output.probs} {input.ref} {output.cns} | ||
else | ||
touch {output.cns:q} | ||
touch {output.probs:q} | ||
touch {output.vcf:q} | ||
fi | ||
""") | ||
else: | ||
shell(""" | ||
touch {output.cns:q} | ||
touch {output.probs:q} | ||
touch {output.vcf:q} | ||
""") | ||
|
||
rule gather_merge_cns: | ||
input: | ||
expand(rules.curate_variants.output.masked, reference=REFERENCES) | ||
ref=expand(rules.files.params.ref,reference=REFERENCES), | ||
cns = expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant","consensus.fasta"),reference=REFERENCES), | ||
cns_cns = expand(os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","medaka_haploid_variant_cns","consensus.fasta"),reference=REFERENCES) | ||
output: | ||
masked = os.path.join(config[KEY_TEMPDIR],"masked_variants.csv") | ||
yaml = os.path.join(config[KEY_TEMPDIR],"consensus_config.yaml") | ||
run: | ||
with open(output.masked,"w") as fw: | ||
fw.write("reference,site,variant\n") | ||
for i in input: | ||
with open(i, "r") as f: | ||
for l in f: | ||
l=l.rstrip("\n") | ||
fw.write(l + '\n') | ||
sequences = collections.defaultdict(set) | ||
ref_seqs = {} | ||
|
||
for cns_file in input.cns: | ||
if "Sabin" in cns_file: | ||
haplodir = "/".join(cns_file.split("/")[:-1]) | ||
else: | ||
haplodir = "/".join(cns_file.split("/")[:-1])+"_cns" | ||
|
||
rule join_clean_cns_ref: | ||
input: | ||
ref=rules.files.params.ref, | ||
cns=rules.curate_variants.output.fasta | ||
output: | ||
fasta = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_medaka_cns_clean.fasta") | ||
run: | ||
with open(output[0],"w") as fw: | ||
for record in SeqIO.parse(input.ref,KEY_FASTA): | ||
match_field = "" | ||
for field in record.description.split(" "): | ||
if field.startswith(VALUE_REFERENCE_GROUP_FIELD): | ||
match_field = field.split("=")[1] | ||
|
||
fw.write(f">{match_field} {record.description}\n{record.seq}\n") | ||
for record in SeqIO.parse(input.cns,KEY_FASTA): | ||
record_name = SAMPLE.replace(" ","_") | ||
fw.write(f">{record_name}\n{record.seq}\n") | ||
|
||
rule align_clean_cns_ref: | ||
input: | ||
rules.join_clean_cns_ref.output.fasta | ||
output: | ||
aln = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","ref_medaka_cns_clean.aln.fasta") | ||
shell: | ||
""" | ||
mafft {input:q} > {output:q} | ||
""" | ||
haplo_bam = os.path.join(haplodir, "calls_to_ref.bam") | ||
for record in SeqIO.parse(cns_file, KEY_FASTA): | ||
print(record.id) | ||
sequences[str(record.seq)].add(haplo_bam) | ||
ref_seqs[str(record.seq)] = record.id | ||
|
||
ref_file_dict = {} | ||
for ref_file in input.ref: | ||
for record in SeqIO.parse(ref_file,KEY_FASTA): | ||
ref_file_dict[record.id] = ref_file | ||
# print(ref_file_dict) | ||
cns_file_dict = {} | ||
for cns_file in input.cns_cns: | ||
for record in SeqIO.parse(cns_file,KEY_FASTA): | ||
cns_file_dict[record.id] = cns_file | ||
# print(cns_file_dict) | ||
|
||
for seq in sequences: | ||
print(seq[:5],"...",sequences[seq]) | ||
|
||
ref_count = collections.Counter() | ||
ref_cns = [] | ||
new_config = config | ||
|
||
rule make_snipit_graph: | ||
input: | ||
aln = rules.align_clean_cns_ref.output.aln | ||
params: | ||
out_stem = os.path.join(config[KEY_TEMPDIR],"snipit","{reference}") | ||
output: | ||
os.path.join(config[KEY_TEMPDIR],"snipit","{reference}.svg") | ||
run: | ||
try: | ||
shell("""snipit {input.aln:q} -o {params.out_stem} -f svg -c wes""") | ||
except: | ||
shell("touch {output[0]:q}") | ||
for seq in sequences: | ||
ref = ref_seqs[seq] | ||
ref_count[ref] +=1 | ||
record_id = f"{ref_seqs[seq]}.{config[KEY_CNS_STEM]}0{ref_count[ref]}" | ||
ref_cns.append(record_id) | ||
|
||
haplo_bams = "" | ||
for i in sequences[seq]: | ||
haplo_bams += f"'{i}' " | ||
|
||
out_bam = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.merged_cns.bam") | ||
read_count_out = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.read_count.txt") | ||
|
||
shell(f"samtools merge -f '{out_bam}' {haplo_bams} && samtools index '{out_bam}'") | ||
shell(f"samtools view -F 0x904 -c '{out_bam}' > {read_count_out}") | ||
|
||
with open(read_count_out,"r") as f: | ||
for l in f: | ||
read_count = l.rstrip() | ||
new_config[record_id] = read_count | ||
|
||
with open(os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.merged_cns.fasta"),"w") as fseq: | ||
fseq.write(f">{record_id} read_count={read_count}\n{seq}\n") | ||
|
||
cns_ref = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.ref.fasta") | ||
cns_cns = os.path.join(config[KEY_TEMPDIR],"reference_analysis",f"{record_id}.cns_ref.fasta") | ||
# print(cns_cns) | ||
shell(f"cp '{ref_file_dict[ref]}' '{cns_ref}'") | ||
if ref in cns_file_dict: | ||
shell(f"cp '{cns_file_dict[ref]}' '{cns_cns}'") | ||
|
||
new_config[BARCODE] = ref_cns | ||
|
||
with open(output.yaml, 'w') as fw: | ||
yaml.dump(new_config, fw) | ||
print(yellow("-----------------------")) | ||
|
||
rule assess_variants: | ||
input: | ||
rules.align_clean_cns_ref.output.aln | ||
params: | ||
reference = "{reference}" | ||
output: | ||
csv = os.path.join(config[KEY_TEMPDIR],"reference_analysis","{reference}","variants.csv") | ||
run: | ||
parse_variants(input[0],output.csv,BARCODE,params.reference) | ||
|
||
rule gather_variants: | ||
input: | ||
expand(rules.assess_variants.output.csv, reference=REFERENCES) | ||
output: | ||
csv = os.path.join(config[KEY_TEMPDIR],"variants.csv") | ||
run: | ||
join_variant_files(VARIANT_CALLS_HEADER_FIELDS,input,output.csv) | ||
|
||
rule gather_cns: | ||
input: | ||
variants = rules.gather_variants.output.csv, | ||
seqs = expand(rules.curate_variants.output.fasta, reference=REFERENCES) | ||
output: | ||
os.path.join(config[KEY_TEMPDIR],"consensus_sequences.fasta") | ||
run: | ||
var_dict = {} | ||
with open(input.variants, "r") as f: | ||
reader = csv.DictReader(f) | ||
for row in reader: | ||
var_dict[row[KEY_REFERENCE]] = row | ||
|
||
with open(output[0],"w") as fw: | ||
for in_file in input.seqs: | ||
reference = os.path.basename(os.path.dirname(in_file)) | ||
for record in SeqIO.parse(in_file,KEY_FASTA): | ||
variant_count = var_dict[reference]["variant_count"] | ||
variant_string = var_dict[reference]["variants"] | ||
fw.write(f">{reference}|{BARCODE}|{variant_count}|{variant_string}\n{record.seq}\n") |