Skip to content

Commit

Permalink
Merge pull request #130 from pdimens/cleaner_impute
Browse files Browse the repository at this point in the history
tidy up impute
  • Loading branch information
pdimens authored Aug 21, 2024
2 parents 0b028ed + ce181e9 commit 07432d2
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 76 deletions.
18 changes: 16 additions & 2 deletions harpy/_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
print_snakefile_error("If you manually edited the Snakefile, see the error below to fix it. If you didn't edit it manually, it's probably a bug (oops!) and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues")
errtext = subprocess.run(sm_args.split(), stderr=subprocess.PIPE, text = True)
errtext = errtext.stderr.split("\n")
startprint = [i for i,j in enumerate(errtext) if "Exception in rule" in j][0]
startprint = [i for i,j in enumerate(errtext) if "Exception in rule" in j or "Error in file" in j or "Exception:" in j][0]
rprint("[red]" + "\n".join(errtext[startprint:]), end = None)
#rprint("[red]" + errtext.stderr.partition("jobs...\n")[2], end = None)
sys.exit(1)
Expand Down Expand Up @@ -120,7 +120,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
break
# prioritizing printing the error and quitting
if err:
if "Shutting down, this" in output:
if "Shutting down, this" in output or output.endswith("]\n"):
sys.exit(1)
rprint(f"[red]{output.strip().partition("Finished job")[0]}", file = sys.stderr)
if "Error in rule" in output or "RuleException" in output:
Expand Down Expand Up @@ -157,6 +157,20 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
if process.returncode < 1:
print_onsuccess(outdir)
else:
print_onerror(sm_logfile)
with open(sm_logfile, "r", encoding="utf-8") as logfile:
line = logfile.readline()
while line:
if "Error" in line or "Exception" in line:
rprint("[bold yellow]" + line.rstrip())
break
line = logfile.readline()
line = logfile.readline()
while line:
if line.endswith("]\n"):
break
rprint("[red]" + line.rstrip())
line = logfile.readline()
sys.exit(1)
except KeyboardInterrupt:
# Handle the keyboard interrupt
Expand Down
65 changes: 25 additions & 40 deletions harpy/reports/stitch_collate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,36 @@ using<-function(...) {
lapply(need,require,character.only=TRUE)
}
}
using("flexdashboard","tidyr","magrittr","DT","ggplot2","plotly","RColorBrewer")
using("flexdashboard","tidyr","magrittr","DT")
```

```{r load data}
infile <- snakemake@input[[1]]
#infile <- "~/contig1.stats"
dataL <- readLines(infile)
bcf <- gsub(".stats$", ".bcf", infile)
bcf <- gsub(".stats$", ".bcf", basename(infile))
chrom <- gsub(".bcf", "", bcf)
basedir <- dirname(normalizePath(infile))
chrom <- basename(basedir)
plotdir <- paste(basedir, "plots/", sep = "/")
plotdir <- normalizePath(snakemake@input[[2]])
plotdir <- paste0(plotdir, "/")
# first by folder
params <- unlist(strsplit(infile, "[/]"))
model <- gsub("model", "", params[length(params) - 6])
usebx <- gsub("usebx", "", params[length(params) - 5])
otherparams <- unlist(strsplit(params[length(params) - 4], "_"))
k <- gsub("k", "", otherparams[1])
s <- gsub("s", "", otherparams[2])
ngen <- gsub("ngen", "", otherparams[3])
bxlimit <- gsub("bxlimit", "", otherparams[4])
```
## `r bcf`
## Contig: `r chrom`

For convenience, this report consolidates the various outputs from `STITCH` and `bcftools` into a single document.
This reflects the general information stored in the records of the bcf file. Since STITCH
This reflects the general information stored in the records of `r bcf`. Since STITCH
requires only bi-allelic SNPs as input, you should not expect to see any other types
of variants (MNPs, indels, _etc._). The images may appear small and can be clicked on to expand/focus them. Click on the image again to remove focus from the image. For the wider images, you may right-click them and open them in a new tab/window to view them at their original
of variants (MNPs, indels, _etc._). The images may appear small and can be clicked on to expand/focus them.
Click on the image again to remove focus from the image. For the wider images, you may right-click them and open
them in a new tab/window to view them at their original
sizes, which may prove to be more useful in some cases.


Expand All @@ -77,43 +90,16 @@ sn <- as.data.frame(t(sn[2]))
rownames(sn) <- NULL
```

**SAMPLES**: `r sn$samples[1]` | **TOTAL RECORDS**: `r sn$records[1]` | **no-ALTs**: `r sn[3,1]` | **SNPs**: `r sn$SNPs[1]` | **MNPs**: `r sn$MNPs[1]`
**model**: `r model` | **use barcodes**: `r usebx` | **k**: `r k` | **s**: `r s` | **ngen**: `r ngen` | **bx limit**: `r bxlimit`

### Per Sample (graph)
```{r}
# scale plot height to sample count
n <- as.numeric(sn$samples[1])
pltheight <- 330 + (15 * n)
pheight <- 1.2 + (0.166 * n)
```
**SAMPLES**: `r sn$samples[1]` | **TOTAL RECORDS**: `r sn$records[1]` | **no-ALTs**: `r sn[3,1]` | **SNPs**: `r sn$SNPs[1]` | **MNPs**: `r sn$MNPs[1]`

```{r basic stats, fig.width = 7.5, dev ='jpeg'}
#TODO make this interactive(?)
### Per Sample Stats
```{r stats table}
.pscL <- grepl("^PSC", dataL)
psc <- read.table(text=dataL[.pscL])[ ,3:14]
names(psc) <- c("Sample", "HomozygousRef", "HomozygousAtl", "Heterozygotes", "Transitions", "Transversions", "Indels", "MeanDepth", "Singletons", "HapRef", "HapAlt", "Missing")
psc$Homozygotes <- psc$HomozygousRef + psc$HomozygousAtl
tidy_psc <- pivot_longer(psc[,c(1,2,3,4,5,6,9,12,13)], -Sample , names_to = "Category", values_to = "Count")
ggplot(data = tidy_psc, mapping = aes(x = Count, y = Sample, color = Category)) +
geom_point(size = 2) +
labs(title = "Statistics by Sample") +
theme_bw() +
scale_x_continuous(n.breaks=9, labels = scales::comma) +
xlab("Count") +
ylab("") +
scale_color_brewer(palette = "Dark2") +
theme(panel.grid.minor = element_blank(), panel.grid.major.y = element_blank())
```

***

This graph contains variant statistics per individual, corresponding
to the `r sn$samples[1]` samples in `r bcf`. The data shown in the next
tab were used to create this visualization.

### Per Sample (table)
```{r stats table}
DT::datatable(
psc[,c(1,12,2,3,13,4,5,6,9)],
rownames = F,
Expand All @@ -132,8 +118,7 @@ DT::datatable(
***

This table contains variant statistics per individual, corresponding
to the `r sn$samples[1]` samples in `r bcf`. These data are what is used
to create the visualization from the previous tab.
to the `r sn$samples[1]` samples in `r bcf`.

### Chromosome-wide Imputation QC Plot
```{r QC_chromwide, dev = "jpeg", out.width="100%"}
Expand Down
6 changes: 1 addition & 5 deletions harpy/scripts/stitch_impute.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,4 @@ debugdir <- paste(outdir, "debug", sep = "/")
# remove debug if it's empty
if (length(list.files(debugdir)) < 1) {
unlink(debugdir, recursive = TRUE)
}

unlink(paste(outdir, "input", sep = "/"), recursive = TRUE)
unlink(paste(outdir, "RData", sep = "/"), recursive = TRUE)
unlink(paste(outdir, "tmp", sep = "/"), recursive = TRUE)
}
51 changes: 22 additions & 29 deletions harpy/snakefiles/impute.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ rule sort_bcf:
bcf = temp(f"{outdir}/workflow/input/vcf/input.sorted.bcf"),
idx = temp(f"{outdir}/workflow/input/vcf/input.sorted.bcf.csi")
log:
f"{outdir}/logs/input.sorted.log"
f"{outdir}/logs/input.sort.log"
container:
None
shell:
Expand Down Expand Up @@ -91,9 +91,13 @@ rule impute:
output:
# format a wildcard pattern like "k{k}/s{s}/ngen{ngen}"
# into a file path, with k, s, ngen being the columns of the data frame
f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz"
temp(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz"),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/plots")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/tmp")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/RData")),
temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/input"))
log:
f"{outdir}/{paramspace.wildcard_pattern}/logs/stitch/" + "{part}/{part}.log"
f"{outdir}/{paramspace.wildcard_pattern}/logs/" + "{part}.stitch.log"
params:
# automatically translate the wildcard values into an instance of the param space
# in the form of a dict (here: {"k": ..., "s": ..., "ngen": ...})
Expand All @@ -111,45 +115,35 @@ rule impute:
rule index_vcf:
input:
vcf = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz"
output:
idx = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz.tbi",
stats = outdir + "/{stitchparams}/contigs/{part}/{part}.stats"
output:
vcf = outdir + "/{stitchparams}/contigs/{part}.vcf.gz",
idx = outdir + "/{stitchparams}/contigs/{part}.vcf.gz.tbi",
stats = outdir + "/{stitchparams}/reports/data/contigs/{part}.stats"
wildcard_constraints:
part = "[^/]+"
container:
None
shell:
"""
tabix {input.vcf}
cp {input} {output.vcf}
tabix {output.vcf}
bcftools stats -s "-" {input.vcf} > {output.stats}
"""

rule stitch_reports:
input:
outdir + "/{stitchparams}/contigs/{part}/{part}.stats"
outdir + "/{stitchparams}/reports/data/contigs/{part}.stats",
outdir + "/{stitchparams}/contigs/{part}/plots"
output:
outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html"
outdir + "/{stitchparams}/reports/{part}.stitch.html"
conda:
f"{envdir}/r.yaml"
script:
"report/stitch_collate.Rmd"

rule clean_plots:
input:
outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html"
output:
temp(touch(outdir + "/{stitchparams}/contigs/{part}/.plots.cleaned"))
params:
stitch = lambda wc: wc.get("stitchparams"),
outdir = outdir
container:
None
priority:
2
shell:
"rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/plots"

rule concat_list:
input:
bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz", part = contigs)
bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz", part = contigs)
output:
temp(outdir + "/{stitchparams}/bcf.files")
run:
Expand All @@ -159,7 +153,7 @@ rule concat_list:
rule merge_vcf:
input:
files = outdir + "/{stitchparams}/bcf.files",
idx = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz.tbi", part = contigs)
idx = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz.tbi", part = contigs)
output:
outdir + "/{stitchparams}/variants.imputed.bcf"
threads:
Expand All @@ -184,7 +178,7 @@ rule general_stats:
bcf = outdir + "/{stitchparams}/variants.imputed.bcf",
idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi"
output:
outdir + "/{stitchparams}/reports/variants.imputed.stats"
outdir + "/{stitchparams}/reports/data/impute.stats"
container:
None
shell:
Expand Down Expand Up @@ -228,8 +222,7 @@ rule workflow_summary:
input:
vcf = collect(outdir + "/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns),
agg_report = collect(outdir + "/{stitchparams}/reports/variants.imputed.html", stitchparams=paramspace.instance_patterns) if not skipreports else [],
contig_report = collect(outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else [],
cleanedplots = collect(outdir + "/{stitchparams}/contigs/{part}/.plots.cleaned", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else []
contig_report = collect(outdir + "/{stitchparams}/reports/{part}.stitch.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else [],
run:
with open(outdir + "/workflow/impute.summary", "w") as f:
_ = f.write("The harpy impute workflow ran using these parameters:\n\n")
Expand Down

0 comments on commit 07432d2

Please sign in to comment.