Skip to content

Commit

Permalink
Merge pull request #130 from pdimens/cleaner_impute
Browse files Browse the repository at this point in the history
tidy up impute
  • Loading branch information
pdimens authored Aug 21, 2024
2 parents 0b028ed + ce181e9 commit 07432d2
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 76 deletions.
18 changes: 16 additions & 2 deletions harpy/_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
print_snakefile_error("If you manually edited the Snakefile, see the error below to fix it. If you didn't edit it manually, it's probably a bug (oops!) and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues")
errtext = subprocess.run(sm_args.split(), stderr=subprocess.PIPE, text = True)
errtext = errtext.stderr.split("\n")
startprint = [i for i,j in enumerate(errtext) if "Exception in rule" in j][0]
startprint = [i for i,j in enumerate(errtext) if "Exception in rule" in j or "Error in file" in j or "Exception:" in j][0]
rprint("[red]" + "\n".join(errtext[startprint:]), end = None)
#rprint("[red]" + errtext.stderr.partition("jobs...\n")[2], end = None)
sys.exit(1)
Expand Down Expand Up @@ -120,7 +120,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
break
# prioritizing printing the error and quitting
if err:
if "Shutting down, this" in output:
if "Shutting down, this" in output or output.endswith("]\n"):
sys.exit(1)
rprint(f"[red]{output.strip().partition("Finished job")[0]}", file = sys.stderr)
if "Error in rule" in output or "RuleException" in output:
Expand Down Expand Up @@ -157,6 +157,20 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
if process.returncode < 1:
print_onsuccess(outdir)
else:
print_onerror(sm_logfile)
with open(sm_logfile, "r", encoding="utf-8") as logfile:
line = logfile.readline()
while line:
if "Error" in line or "Exception" in line:
rprint("[bold yellow]" + line.rstrip())
break
line = logfile.readline()
line = logfile.readline()
while line:
if line.endswith("]\n"):
break
rprint("[red]" + line.rstrip())
line = logfile.readline()
sys.exit(1)
except KeyboardInterrupt:
# Handle the keyboard interrupt
Expand Down
65 changes: 25 additions & 40 deletions harpy/reports/stitch_collate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,36 @@ using<-function(...) {
lapply(need,require,character.only=TRUE)
}
}
using("flexdashboard","tidyr","magrittr","DT","ggplot2","plotly","RColorBrewer")
using("flexdashboard","tidyr","magrittr","DT")
```

```{r load data}
infile <- snakemake@input[[1]]
#infile <- "~/contig1.stats"
dataL <- readLines(infile)
bcf <- gsub(".stats$", ".bcf", infile)
bcf <- gsub(".stats$", ".bcf", basename(infile))
chrom <- gsub(".bcf", "", bcf)
basedir <- dirname(normalizePath(infile))
chrom <- basename(basedir)
plotdir <- paste(basedir, "plots/", sep = "/")
plotdir <- normalizePath(snakemake@input[[2]])
plotdir <- paste0(plotdir, "/")
# first by folder
params <- unlist(strsplit(infile, "[/]"))
model <- gsub("model", "", params[length(params) - 6])
usebx <- gsub("usebx", "", params[length(params) - 5])
otherparams <- unlist(strsplit(params[length(params) - 4], "_"))
k <- gsub("k", "", otherparams[1])
s <- gsub("s", "", otherparams[2])
ngen <- gsub("ngen", "", otherparams[3])
bxlimit <- gsub("bxlimit", "", otherparams[4])
```
## `r bcf`
## Contig: `r chrom`

For convenience, this report consolidates the various outputs from `STITCH` and `bcftools` into a single document.
This reflects the general information stored in the records of the bcf file. Since STITCH
This reflects the general information stored in the records of `r bcf`. Since STITCH
requires only bi-allelic SNPs as input, you should not expect to see any other types
of variants (MNPs, indels, _etc._). The images may appear small and can be clicked on to expand/focus them. Click on the image again to remove focus from the image. For the wider images, you may right-click them and open them in a new tab/window to view them at their original
of variants (MNPs, indels, _etc._). The images may appear small and can be clicked on to expand/focus them.
Click on the image again to remove focus from the image. For the wider images, you may right-click them and open
them in a new tab/window to view them at their original
sizes, which may prove to be more useful in some cases.


Expand All @@ -77,43 +90,16 @@ sn <- as.data.frame(t(sn[2]))
rownames(sn) <- NULL
```

**SAMPLES**: `r sn$samples[1]` | **TOTAL RECORDS**: `r sn$records[1]` | **no-ALTs**: `r sn[3,1]` | **SNPs**: `r sn$SNPs[1]` | **MNPs**: `r sn$MNPs[1]`
**model**: `r model` | **use barcodes**: `r usebx` | **k**: `r k` | **s**: `r s` | **ngen**: `r ngen` | **bx limit**: `r bxlimit`

### Per Sample (graph)
```{r}
# scale plot height to sample count
n <- as.numeric(sn$samples[1])
pltheight <- 330 + (15 * n)
pheight <- 1.2 + (0.166 * n)
```
**SAMPLES**: `r sn$samples[1]` | **TOTAL RECORDS**: `r sn$records[1]` | **no-ALTs**: `r sn[3,1]` | **SNPs**: `r sn$SNPs[1]` | **MNPs**: `r sn$MNPs[1]`

```{r basic stats, fig.width = 7.5, dev ='jpeg'}
#TODO make this interactive(?)
### Per Sample Stats
```{r stats table}
.pscL <- grepl("^PSC", dataL)
psc <- read.table(text=dataL[.pscL])[ ,3:14]
names(psc) <- c("Sample", "HomozygousRef", "HomozygousAtl", "Heterozygotes", "Transitions", "Transversions", "Indels", "MeanDepth", "Singletons", "HapRef", "HapAlt", "Missing")
psc$Homozygotes <- psc$HomozygousRef + psc$HomozygousAtl
tidy_psc <- pivot_longer(psc[,c(1,2,3,4,5,6,9,12,13)], -Sample , names_to = "Category", values_to = "Count")
ggplot(data = tidy_psc, mapping = aes(x = Count, y = Sample, color = Category)) +
geom_point(size = 2) +
labs(title = "Statistics by Sample") +
theme_bw() +
scale_x_continuous(n.breaks=9, labels = scales::comma) +
xlab("Count") +
ylab("") +
scale_color_brewer(palette = "Dark2") +
theme(panel.grid.minor = element_blank(), panel.grid.major.y = element_blank())
```

***

This graph contains variant statistics per individual, corresponding
to the `r sn$samples[1]` samples in `r bcf`. The data shown in the next
tab were used to create this visualization.

### Per Sample (table)
```{r stats table}
DT::datatable(
psc[,c(1,12,2,3,13,4,5,6,9)],
rownames = F,
Expand All @@ -132,8 +118,7 @@ DT::datatable(
***

This table contains variant statistics per individual, corresponding
to the `r sn$samples[1]` samples in `r bcf`. These data are what is used
to create the visualization from the previous tab.
to the `r sn$samples[1]` samples in `r bcf`.

### Chromosome-wide Imputation QC Plot
```{r QC_chromwide, dev = "jpeg", out.width="100%"}
Expand Down
6 changes: 1 addition & 5 deletions harpy/scripts/stitch_impute.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,4 @@ debugdir <- paste(outdir, "debug", sep = "/")
# remove debug if it's empty
if (length(list.files(debugdir)) < 1) {
unlink(debugdir, recursive = TRUE)
}

unlink(paste(outdir, "input", sep = "/"), recursive = TRUE)
unlink(paste(outdir, "RData", sep = "/"), recursive = TRUE)
unlink(paste(outdir, "tmp", sep = "/"), recursive = TRUE)
}
51 changes: 22 additions & 29 deletions harpy/snakefiles/impute.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ rule sort_bcf:
bcf = temp(f"{outdir}/workflow/input/vcf/input.sorted.bcf"),
idx = temp(f"{outdir}/workflow/input/vcf/input.sorted.bcf.csi")
log:
f"{outdir}/logs/input.sorted.log"
f"{outdir}/logs/input.sort.log"
container:
None
shell:
Expand Down Expand Up @@ -91,9 +91,13 @@ rule impute:
output:
# format a wildcard pattern like "k{k}/s{s}/ngen{ngen}"
# into a file path, with k, s, ngen being the columns of the data frame
f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz"
temp(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz"),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/plots")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/tmp")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/RData")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/input"))
log:
f"{outdir}/{paramspace.wildcard_pattern}/logs/stitch/" + "{part}/{part}.log"
f"{outdir}/{paramspace.wildcard_pattern}/logs/" + "{part}.stitch.log"
params:
# automatically translate the wildcard values into an instance of the param space
# in the form of a dict (here: {"k": ..., "s": ..., "ngen": ...})
Expand All @@ -111,45 +115,35 @@ rule impute:
rule index_vcf:
input:
vcf = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz"
output:
idx = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz.tbi",
stats = outdir + "/{stitchparams}/contigs/{part}/{part}.stats"
output:
vcf = outdir + "/{stitchparams}/contigs/{part}.vcf.gz",
idx = outdir + "/{stitchparams}/contigs/{part}.vcf.gz.tbi",
stats = outdir + "/{stitchparams}/reports/data/contigs/{part}.stats"
wildcard_constraints:
part = "[^/]+"
container:
None
shell:
"""
tabix {input.vcf}
cp {input} {output.vcf}
tabix {output.vcf}
bcftools stats -s "-" {input.vcf} > {output.stats}
"""

rule stitch_reports:
input:
outdir + "/{stitchparams}/contigs/{part}/{part}.stats"
outdir + "/{stitchparams}/reports/data/contigs/{part}.stats",
outdir + "/{stitchparams}/contigs/{part}/plots"
output:
outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html"
outdir + "/{stitchparams}/reports/{part}.stitch.html"
conda:
f"{envdir}/r.yaml"
script:
"report/stitch_collate.Rmd"

rule clean_plots:
input:
outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html"
output:
temp(touch(outdir + "/{stitchparams}/contigs/{part}/.plots.cleaned"))
params:
stitch = lambda wc: wc.get("stitchparams"),
outdir = outdir
container:
None
priority:
2
shell:
"rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/plots"

rule concat_list:
input:
bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz", part = contigs)
bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz", part = contigs)
output:
temp(outdir + "/{stitchparams}/bcf.files")
run:
Expand All @@ -159,7 +153,7 @@ rule concat_list:
rule merge_vcf:
input:
files = outdir + "/{stitchparams}/bcf.files",
idx = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz.tbi", part = contigs)
idx = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz.tbi", part = contigs)
output:
outdir + "/{stitchparams}/variants.imputed.bcf"
threads:
Expand All @@ -184,7 +178,7 @@ rule general_stats:
bcf = outdir + "/{stitchparams}/variants.imputed.bcf",
idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi"
output:
outdir + "/{stitchparams}/reports/variants.imputed.stats"
outdir + "/{stitchparams}/reports/data/impute.stats"
container:
None
shell:
Expand Down Expand Up @@ -228,8 +222,7 @@ rule workflow_summary:
input:
vcf = collect(outdir + "/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns),
agg_report = collect(outdir + "/{stitchparams}/reports/variants.imputed.html", stitchparams=paramspace.instance_patterns) if not skipreports else [],
contig_report = collect(outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else [],
cleanedplots = collect(outdir + "/{stitchparams}/contigs/{part}/.plots.cleaned", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else []
contig_report = collect(outdir + "/{stitchparams}/reports/{part}.stitch.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else [],
run:
with open(outdir + "/workflow/impute.summary", "w") as f:
_ = f.write("The harpy impute workflow ran using these parameters:\n\n")
Expand Down

0 comments on commit 07432d2

Please sign in to comment.