Mergevariants #113

Merged: 11 commits merged on Jul 23, 2024
src/harpy/bin/infer_sv.py: 1 addition & 3 deletions
@@ -27,8 +27,7 @@
 }

 if args.failfile is not None:
-    failout = open(args.failfile, "w", encoding="utf-8")
-    with open(args.bedfile, "r", encoding="utf-8") as f:
+    with open(args.bedfile, "r", encoding="utf-8") as f, open(args.failfile, "w", encoding="utf-8") as failout:
         # first line, the header
         line = f.readline().strip().split("\t")
         line_corrected = [i.title().replace(" ", "") for i in line]
@@ -49,7 +48,6 @@
             sys.stdout.write(NEWROW)
         else:
             failout.write(NEWROW)
-    failout.close()
 else:
     with open(args.bedfile, "r", encoding="utf-8") as f:
         # first line, the header
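The infer_sv.py change above collapses a manually opened and closed failout handle into a single with statement that manages both files, so the failure file is closed even if an exception is raised mid-loop. A minimal sketch of that pattern (the filenames and the routing predicate here are illustrative placeholders, not Harpy's actual filtering logic):

import sys

# One `with` statement managing both files: the explicit failout.close()
# goes away because both handles are released automatically.
def split_records(bedfile, failfile):
    with open(bedfile, "r", encoding="utf-8") as f, open(failfile, "w", encoding="utf-8") as failout:
        for line in f:
            # toy criterion; the real script applies its own per-record filter
            if "FAIL" in line:
                failout.write(line)
            else:
                sys.stdout.write(line)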
src/harpy/reports/impute.Rmd: 10 additions & 6 deletions
@@ -236,20 +236,24 @@ the reference allele (from the reference genome), `Alt` refers to the alternative
 allele, and `Missing` details how many SNPs remain missing
 (failed to impute) for that individual. As an example, the `Homozygous Ref` is
 the number of missing genotypes that were imputed into
-a homozygous genotype for the reference allele. The table on the right is the
-underlying data used to create the visualization. Clicking
-on a plot will expand it to fill your browser window. Clicking it again will exit
-out of the zoomed view.
+a homozygous genotype for the reference allele. The table on the other tab is the
+underlying data used to create the visualization.

 ```{r}
 n_samples <- length(unique(gcts_long$sample))
 plotheight <- 150 + (15 * n_samples)
 figheight <- 1 + (0.2 * n_samples)
 ```

-## ind stats plt
+## ind stats plt {.tabset data-height=plotheight}
 ### Individuals Plot {.no-title}
+```{r preprocess fields, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
+gcts_long$Conversion <- gsub("AltHom","Homozygous Alt", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion)
+```
-```{r percontig dist, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
+```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE, fig.height=figheight, out.width = "100%"}
 hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |>
   hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |>
   hc_title(text = "Missing Genotype Imputations") |>
src/harpy/reports/naibr.Rmd: 0 additions & 2 deletions
@@ -108,8 +108,6 @@ summstats <- sv %>%
   group_by(Chr1, SV) %>%
   summarise(n = n())

-#svlen <- sv %>% group_by(SV) %>% summarise(mlen = mean(Length), sdlen = sd(Length))
-
 sumtable <- summstats %>%
   group_by(SV) %>%
   summarise(count = sum(n))
src/harpy/reports/naibr_pop.Rmd: 6 additions & 6 deletions
@@ -279,10 +279,10 @@ fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2]
 fa.sizes <- fa.sizes[1:min(nrow(fa.sizes), 30), ]

 # filter out variants that aren't on one of these contigs
-sv <- sv[sv$Chr1 %in% fa.sizes$contig, ]
-sv$Chr1 <- factor(sv$Chr1)
+sv_clean <- sv_clean[sv_clean$Chr1 %in% fa.sizes$contig, ]
+sv_clean$Chr1 <- factor(sv_clean$Chr1)

-populations <- levels(sv$Population)
+populations <- levels(sv_clean$Population)
 color.palette <- turbo(length(populations))
 names(color.palette) <- populations

@@ -322,7 +322,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "inversion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -372,7 +372,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "duplication"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -420,7 +420,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "deletion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosSNPTrack(
       paste0(variant_type, pop),
src/harpy/snakefiles/sv_leviathan.smk: 42 additions & 6 deletions
@@ -156,7 +156,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{sample}.vcf")
+        temp(outdir + "/vcf/{sample}.vcf")
     log:
         runlog = outdir + "/logs/{sample}.leviathan.log",
         candidates = outdir + "/logs/{sample}.candidates"
@@ -178,9 +178,9 @@

 rule sort_bcf:
     input:
-        outdir + "/{sample}.vcf"
+        outdir + "/vcf/{sample}.vcf"
     output:
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     params:
         "{wildcards.sample}"
     container:
@@ -192,9 +192,9 @@

 rule sv_stats:
     input:
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     output:
-        outdir + "/reports/data/{sample}.sv.stats"
+        temp(outdir + "/reports/data/{sample}.sv.stats")
     container:
         None
     message:
@@ -205,6 +205,41 @@
         bcftools query -f '{wildcards.sample}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{sample}.sv.stats", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["sample","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:
         faidx = f"Genome/{bn}.fai",
@@ -222,7 +257,8 @@ rule sv_report:
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{sample}.bcf", sample = samplenames),
+        vcf = collect(outdir + "/vcf/{sample}.bcf", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
        reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skipreports else []
     params:
         min_sv = f"-v {min_sv}",
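The new merge_variants rule reads each per-sample .sv.stats table produced by bcftools query (one header line, then tab-separated records with the SVTYPE code in the sixth column) and routes every record into one aggregate file per SV type. A standalone sketch of the same routing logic, with illustrative output paths (the rule itself writes into outdir):

# Sketch of the merge_variants run block, assuming each input table has one
# header line and tab-separated records with INV/DEL/DUP/BND at index 5.
SV_FILES = {"INV": "inversions.bedpe", "DEL": "deletions.bedpe",
            "DUP": "duplications.bedpe", "BND": "breakends.bedpe"}
HEADER = ["sample", "contig", "position_start", "position_end",
          "length", "type", "n_barcodes", "n_pairs"]

def merge_sv_stats(stats_files):
    # open one output handle per SV type, keyed by the SVTYPE code
    handles = {sv: open(path, "w", encoding="utf-8") for sv, path in SV_FILES.items()}
    try:
        for handle in handles.values():
            handle.write("\t".join(HEADER) + "\n")
        for varfile in stats_files:
            with open(varfile, "r", encoding="utf-8") as f_in:
                next(f_in, None)  # skip the per-sample header line
                for line in f_in:
                    record = line.rstrip("\n").split("\t")
                    if len(record) > 5 and record[5] in handles:
                        handles[record[5]].write(line)
    finally:
        for handle in handles.values():
            handle.close()

Keeping the output handles in a dict avoids the long chained with statement and makes adding another SV type a one-line change.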
src/harpy/snakefiles/sv_leviathan_pop.smk: 44 additions & 8 deletions
@@ -182,7 +182,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{population}.vcf")
+        temp(outdir + "/vcf/{population}.vcf")
     log:
         runlog = outdir + "/logs/{population}.leviathan.log",
         candidates = outdir + "/logs/{population}.candidates"
@@ -196,17 +196,17 @@
     conda:
         f"{envdir}/sv.yaml"
     message:
-        "Calling variants: Population {wildcards.population}"
+        "Calling variants: {wildcards.population}"
     benchmark:
         ".Benchmark/leviathan-pop/{population}.variantcall"
     shell:
         "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output} -t {threads} --candidates {log.candidates} 2> {log.runlog}"

 rule sort_bcf:
     input:
-        outdir + "/{population}.vcf"
+        outdir + "/vcf/{population}.vcf"
     output:
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     params:
         "{wildcards.population}"
     container:
@@ -218,9 +218,9 @@

 rule sv_stats:
     input:
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     output:
-        outdir + "/reports/data/{population}.sv.stats"
+        temp(outdir + "/reports/data/{population}.sv.stats")
     container:
         None
     message:
@@ -231,10 +231,45 @@ rule sv_stats:
         bcftools query -f '{wildcards.population}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{population}.sv.stats", population = populations)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["population","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:
         statsfile = outdir + "/reports/data/{population}.sv.stats",
-        bcf = outdir + "/{population}.bcf",
+        bcf = outdir + "/vcf/{population}.bcf",
         faidx = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{population}.sv.html"
@@ -261,7 +296,8 @@ rule sv_report_aggregate:
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{pop}.bcf", pop = populations),
+        vcf = collect(outdir + "/vcf/{pop}.bcf", pop = populations),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
         reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skipreports else [],
         agg_report = outdir + "/reports/leviathan.summary.html" if not skipreports else []
     message:
src/harpy/snakefiles/sv_naibr.smk: 39 additions & 4 deletions
@@ -139,9 +139,9 @@ rule infer_sv:
         refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
         vcf = outdir + "/{sample}/{sample}.vcf"
     output:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
-        fail = outdir + "/filtered/{sample}.fail.bedpe",
+        fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
         vcf = outdir + "/vcf/{sample}.vcf"
     params:
         outdir = lambda wc: outdir + "/" + wc.get("sample")
@@ -157,6 +157,40 @@
         rm -rf {params.outdir}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule genome_link:
     input:
         genomefile
@@ -202,7 +236,7 @@ rule genome_faidx:

 rule create_report:
     input:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         fai = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{sample}.naibr.html"
@@ -216,7 +250,8 @@ rule create_report:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames),
+        bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else []
     message:
         "Summarizing the workflow: {output}"
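The NAIBR version of merge_variants differs from the LEVIATHAN ones in two details: it keys on the last column (the SV type as a word rather than an SVTYPE code), and, because NAIBR's per-sample BEDPE carries no sample column, it derives the sample name from the file name with Path(varfile).stem and prepends it to every record. A sketch of just that tagging step (the path in the comment is a hypothetical example):

from pathlib import Path

# Derive the sample name from the BEDPE file name and prepend it to each
# record, matching the "Sample" column that the aggregate header declares.
def tag_records(varfile):
    samplename = Path(varfile).stem  # e.g. "sample_01.bedpe" -> "sample_01"
    with open(varfile, "r", encoding="utf-8") as f_in:
        next(f_in, None)  # NAIBR writes its own header line; skip it
        return [f"{samplename}\t{line}" for line in f_in]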