diff --git a/DESCRIPTION b/DESCRIPTION index d57e89c..5a56306 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RaMS Type: Package Title: R Access to Mass-Spec Data -Version: 1.3.3 +Version: 1.3.4 Authors@R: c( person(given = "William", family = "Kumler", email = "wkumler@uw.edu", role = c("aut", "cre", "cph")), diff --git a/cran-comments.md b/cran-comments.md index 3d80ecf..616a37a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,12 @@ ## Resubmission +This is a resubmission. In this version I have set data.table to use only two +threads to avoid CPU time > 2.5 times elapsed time in the vignettes + + +## Resubmission (prior) + This is a resubmission. In this version I have reduced the example runtimes to below 5s (I hope) for tests and examples and set data.table to use only two threads to avoid CPU time > 2.5 times elapsed time diff --git a/doc/Intro-to-RaMS.R b/doc/Intro-to-RaMS.R index 04f1b4b..2a99e0c 100644 --- a/doc/Intro-to-RaMS.R +++ b/doc/Intro-to-RaMS.R @@ -1,6 +1,7 @@ -## ---- include = FALSE--------------------------------------------------------- +## ----include = FALSE---------------------------------------------------------- options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, @@ -142,7 +143,7 @@ msdata$MS2 <- mutate(msdata$MS2, neutral_loss=premz-fragmz) %>% msdata$MS2[neutral_loss%between%pmppm(homarine_neutral_loss, ppm = 5)] %>% arrange(desc(int)) %>% head() %>% knitr::kable() -## ---- fig.height=3------------------------------------------------------------ +## ----fig.height=3------------------------------------------------------------- chrom_file <- system.file("extdata", "wk_chrom.mzML.gz", package = "RaMS") msdata_chroms <- grabMSdata(chrom_file, verbosity = 0, grab_what = "chroms") given_chrom <- msdata_chroms$chroms[chrom_type=="SRM iletter1"] @@ -192,7 +193,7 @@ as.numeric(object.size(all_data)/object.size(small_data)) ## ----verbosedemo-------------------------------------------------------------- all_data <- grabMSdata(data_files, grab_what = c("MS1", "MS2"), verbosity = 2) -## ---- eval=FALSE-------------------------------------------------------------- +## ----eval=FALSE--------------------------------------------------------------- # ## Not run: # library(parallel) # cl <- makeCluster(getOption("cl.cores", detectCores()-1)) @@ -206,12 +207,12 @@ all_data <- grabMSdata(data_files, grab_what = c("MS1", "MS2"), verbosity = 2) # } # stopImplicitCluster() -## ---- eval=FALSE-------------------------------------------------------------- +## ----eval=FALSE--------------------------------------------------------------- # ## Not run: # data_nodes <- xml2::xml_find_all(mzML_nodes, xpath="//d1:precursorMz") # raw_data <- xml2::xml_attr(data_nodes, "value") -## ---- eval=FALSE-------------------------------------------------------------- +## ----eval=FALSE--------------------------------------------------------------- # ## Not run: # decoded_binary <- base64enc::base64decode(binary) # raw_binary <- as.raw(decoded_binary) diff --git a/doc/Intro-to-RaMS.Rmd b/doc/Intro-to-RaMS.Rmd index 7d411eb..43afd02 100644 --- a/doc/Intro-to-RaMS.Rmd +++ b/doc/Intro-to-RaMS.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/doc/Intro-to-RaMS.html b/doc/Intro-to-RaMS.html index 480a3fa..e4c2a87 100644 --- a/doc/Intro-to-RaMS.html +++ b/doc/Intro-to-RaMS.html @@ -404,10 +404,10 @@

TICs, BPCs, and metadata

msdata <- grabMSdata(single_file, grab_what = "TIC") #> -#> Reading file LB12HL_AB.mzML.gz... 0.04 secs -#> Reading TIC...0.09 secs +#> Reading file LB12HL_AB.mzML.gz... 0.05 secs +#> Reading TIC...0.11 secs #> Binding files together into single data.table -#> Total time: 0.14 secs +#> Total time: 0.16 secs knitr::kable(head(msdata$TIC, 3)) @@ -448,10 +448,10 @@

TICs, BPCs, and metadata

calculated.

msdata <- grabMSdata(single_file, grab_what = "BPC")
 #> 
-#> Reading file LB12HL_AB.mzML.gz... 0.04 secs 
-#> Reading BPC...0.06 secs 
+#> Reading file LB12HL_AB.mzML.gz... 0.05 secs 
+#> Reading BPC...0.07 secs 
 #> Binding files together into single data.table
-#> Total time: 0.1 secs
+#> Total time: 0.12 secs

Since the data is parsed in a “tidy” format, it plays nicely with popular packages such as ggplot2. Let’s use that to plot our BPC instead of the base R plotting system:

@@ -501,8 +501,8 @@

TICs, BPCs, and metadata

# less-minified DDA file heregrabMSdata(files = data_files[1], grab_what = "metadata")#> -#> Reading file DDApos_2.mzML.gz... 0.13 secs -#> Reading file metadata...0.37 secs +#> Reading file DDApos_2.mzML.gz... 0.12 secs +#> Reading file metadata...0.38 secs #> Binding files together into single data.table#> Total time: 0.51 secs#> $metadata @@ -529,7 +529,7 @@

Adding a column: MS1 data

|=============================================== | 67% | |======================================================================| 100% -#> Total time: 0.72 secs +#> Total time: 0.8 secsknitr::kable(head(msdata$MS1, 3))
@@ -606,14 +606,14 @@

Moving along: MS2 data

a good idea to grab the MS1 data alongside.

msdata <- grabMSdata(data_files[1], grab_what = "everything")
 #> 
-#> Reading file DDApos_2.mzML.gz... 0.46 secs 
+#> Reading file DDApos_2.mzML.gz... 0.41 secs 
 #> Reading MS1 data...0.29 secs 
 #> Reading MS2 data...0.11 secs 
-#> Reading BPC...0.16 secs 
-#> Reading TIC...0.16 secs 
-#> Reading file metadata...0.47 secs 
+#> Reading BPC...0.17 secs 
+#> Reading TIC...0.17 secs 
+#> Reading file metadata...0.37 secs 
 #> Binding files together into single data.table
-#> Total time: 1.65 secs
+#> Total time: 1.52 secs
 knitr::kable(head(msdata$MS2, 3))
@@ -926,7 +926,7 @@

Saving space: EICs and rtrange

|==================================================== | 75% | |======================================================================| 100% -#> Total time: 1.57 secs +#> Total time: 1.72 secsmzs_of_interest <- c(adenine=136.06232, valine=118.0865)small_data <- grabMSdata(data_files, grab_what = c("EIC", "EIC_MS2"), @@ -942,7 +942,7 @@

Saving space: EICs and rtrange

|==================================================== | 75% | |======================================================================| 100% -#> Total time: 1.52 secs +#> Total time: 1.63 secsall_data$MS1 %>% mutate(type="All data") %>% @@ -986,7 +986,7 @@

Saving space: EICs and rtrange

|==================================================== | 75% | |======================================================================| 100% -#> Total time: 1.49 secs +#> Total time: 1.28 secsall_data$MS1 %>% mutate(type="All data") %>% @@ -1030,26 +1030,26 @@

Speeding things up

#> | |================== | 25% -#> Reading file LB12HL_AB.mzML.gz... 0.12 secs -#> Reading MS1 data...0.1 secs +#> Reading file LB12HL_AB.mzML.gz... 0.15 secs +#> Reading MS1 data...0.12 secs #> Reading MS2 data...0.01 secs #> | |=================================== | 50% -#> Reading file LB12HL_CD.mzML.gz... 0.12 secs -#> Reading MS1 data...0.1 secs +#> Reading file LB12HL_CD.mzML.gz... 0.15 secs +#> Reading MS1 data...0.12 secs #> Reading MS2 data...0.01 secs #> | |==================================================== | 75% -#> Reading file LB12HL_EF.mzML.gz... 0.12 secs -#> Reading MS1 data...0.1 secs +#> Reading file LB12HL_EF.mzML.gz... 0.13 secs +#> Reading MS1 data...0.11 secs #> Reading MS2 data...0.01 secs #> | |======================================================================| 100%#> Binding files together into single data.table -#> Total time: 1.55 secs +#> Total time: 1.65 secs

In general, slow file read times can be improved by compressing the data. mzML files are highly compressible, with options to compress both the data itself using the zlib method and the files as a @@ -1160,7 +1160,7 @@

The nitty-gritty details

For more information about the mzML data format and its history, check out the specification at https://www.psidev.info/mzML.


-

Vignette last built on 2023-11-29

+

Vignette last built on 2023-12-13

diff --git a/doc/Intro-to-tmzML.R b/doc/Intro-to-tmzML.R index e3a3bbf..9f1deed 100644 --- a/doc/Intro-to-tmzML.R +++ b/doc/Intro-to-tmzML.R @@ -1,6 +1,7 @@ ## ----setup, include = FALSE--------------------------------------------------- options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/doc/Intro-to-tmzML.Rmd b/doc/Intro-to-tmzML.Rmd index c7d2ae9..e79b9f8 100644 --- a/doc/Intro-to-tmzML.Rmd +++ b/doc/Intro-to-tmzML.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/doc/Intro-to-tmzML.html b/doc/Intro-to-tmzML.html index 47164cc..ff60389 100644 --- a/doc/Intro-to-tmzML.html +++ b/doc/Intro-to-tmzML.html @@ -462,27 +462,16 @@

Getting started with tmzML documents

ggplot(ms_data_table) + geom_line(aes(x=rt, y=int, color=filename)) + xlim(8, 9.5)

library(dplyr)
-#> 
-#> Attaching package: 'dplyr'
-#> The following object is masked from 'package:RaMS':
-#> 
-#>     between
-#> The following objects are masked from 'package:stats':
-#> 
-#>     filter, lag
-#> The following objects are masked from 'package:base':
-#> 
-#>     intersect, setdiff, setequal, union
-ms_data_table %>%
-  filter(rt%between%c(8.4, 8.9)) %>%
-  group_by(filename) %>%
-  summarise(area=sum(int))
-#> # A tibble: 3 × 2
-#>   filename             area
-#>   <chr>               <dbl>
-#> 1 LB12HL_AB.tmzML 13057831.
-#> 2 LB12HL_CD.tmzML 24925878.
-#> 3 LB12HL_EF.tmzML 21913457.
+ms_data_table %>% + filter(rt%between%c(8.4, 8.9)) %>% + group_by(filename) %>% + summarise(area=sum(int)) +#> # A tibble: 3 × 2 +#> filename area +#> <chr> <dbl> +#> 1 LB12HL_AB.tmzML 13057831. +#> 2 LB12HL_CD.tmzML 24925878. +#> 3 LB12HL_EF.tmzML 21913457.

Why not tmzML?

@@ -509,7 +498,7 @@

Why not tmzML?

#> $ MS2 : NULL #> $ connection:List of 3 #> ..$ files : Named chr [1:3] "tmzMLs/LB12HL_AB.tmzML" "tmzMLs/LB12HL_CD.tmzML" "tmzMLs/LB12HL_EF.tmzML" -#> .. ..- attr(*, "names")= chr [1:3] "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_AB.mzML.gz" "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_CD.mzML.gz" "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_EF.mzML.gz" +#> .. ..- attr(*, "names")= chr [1:3] "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_AB.mzML.gz" "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_CD.mzML.gz" "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_EF.mzML.gz" #> ..$ grab_what: chr [1:2] "MS1" "MS2" #> ..$ verbosity: num 0 #> - attr(*, "class")= chr "msdata_connection"
@@ -573,7 +562,7 @@

tmzML internals

fast so we may be approaching diminishing returns.

unlink("tmzMLs", recursive = TRUE)

-

Vignette last built on 2023-11-29

+

Vignette last built on 2023-12-13

diff --git a/doc/Minifying-files-with-RaMS.R b/doc/Minifying-files-with-RaMS.R index 7879811..9fe91e8 100644 --- a/doc/Minifying-files-with-RaMS.R +++ b/doc/Minifying-files-with-RaMS.R @@ -1,6 +1,7 @@ ## ----setup, include = FALSE--------------------------------------------------- options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/doc/Minifying-files-with-RaMS.Rmd b/doc/Minifying-files-with-RaMS.Rmd index 6ddbe20..f2e9ea4 100644 --- a/doc/Minifying-files-with-RaMS.Rmd +++ b/doc/Minifying-files-with-RaMS.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/doc/Minifying-files-with-RaMS.html b/doc/Minifying-files-with-RaMS.html index 0151a94..33594cd 100644 --- a/doc/Minifying-files-with-RaMS.html +++ b/doc/Minifying-files-with-RaMS.html @@ -365,23 +365,23 @@

Minifying files with RaMS

Below, we begin with a large MS file containing both MS1 and MS2 data and extract only the data corresponding to valine/glycine betaine and homarine.

-
library(RaMS)
-msdata_files <- list.files(
-  system.file("extdata", package = "RaMS"), full.names = TRUE, pattern = "mzML"
-)[1:4]
-
-initial_filename <- msdata_files[1]
-output_filename <- gsub(x=paste0("minified_", basename(initial_filename)), "\\.gz", "")
-
-masses_of_interest <- c(118.0865, 138.0555)
-minifyMSdata(files = initial_filename, output_files = output_filename, 
-             mz_include = masses_of_interest, ppm = 10, warn = FALSE)
+
library(RaMS)
+msdata_files <- list.files(
+  system.file("extdata", package = "RaMS"), full.names = TRUE, pattern = "mzML"
+)[1:4]
+
+initial_filename <- msdata_files[1]
+output_filename <- gsub(x=paste0("minified_", basename(initial_filename)), "\\.gz", "")
+
+masses_of_interest <- c(118.0865, 138.0555)
+minifyMSdata(files = initial_filename, output_files = output_filename, 
+             mz_include = masses_of_interest, ppm = 10, warn = FALSE)

Then, when we open the file up (with RaMS or other software) we are left with the data corresponding only to those compounds:

-
init_msdata <- grabMSdata(initial_filename)
-msdata <- grabMSdata(output_filename)
-
knitr::kable(head(msdata$MS1, 3))
+
init_msdata <- grabMSdata(initial_filename)
+msdata <- grabMSdata(output_filename)
+
knitr::kable(head(msdata$MS1, 3))
@@ -412,7 +412,7 @@

Minifying files with RaMS

-
knitr::kable(head(msdata$MS2, 3))
+
knitr::kable(head(msdata$MS2, 3))
@@ -461,23 +461,23 @@

Minifying files with RaMS

Both the TIC and BPC are updated to reflect the smaller file size as well:

-
par(mfrow=c(2, 1), mar=c(2.1, 2.1, 1.1, 0.1))
-plot(init_msdata$BPC$rt, init_msdata$BPC$int, type = "l", main = "Initial BPC")
-plot(msdata$BPC$rt, msdata$BPC$int, type = "l", main = "New BPC")
+
par(mfrow=c(2, 1), mar=c(2.1, 2.1, 1.1, 0.1))
+plot(init_msdata$BPC$rt, init_msdata$BPC$int, type = "l", main = "Initial BPC")
+plot(msdata$BPC$rt, msdata$BPC$int, type = "l", main = "New BPC")


The minifyMSdata function is vectorized so the exact same syntax can be used for multiple files:

-
dir.create("mini_mzMLs/")
-output_files <- paste0("mini_mzMLs/", basename(msdata_files))
-output_files <- gsub(x=output_files, "\\.gz", "")
-
-minifyMSdata(files = msdata_files, output_files = output_files, verbosity = 0,
-             mz_include = masses_of_interest, ppm = 10, warn = FALSE)
-
mini_msdata <- grabMSdata(output_files, verbosity = 0)
-
-library(ggplot2)
-ggplot(mini_msdata$BPC) + geom_line(aes(x=rt, y=int, color=filename)) + theme_bw()
+
dir.create("mini_mzMLs/")
+output_files <- paste0("mini_mzMLs/", basename(msdata_files))
+output_files <- gsub(x=output_files, "\\.gz", "")
+
+minifyMSdata(files = msdata_files, output_files = output_files, verbosity = 0,
+             mz_include = masses_of_interest, ppm = 10, warn = FALSE)
+
mini_msdata <- grabMSdata(output_files, verbosity = 0)
+
+library(ggplot2)
+ggplot(mini_msdata$BPC) + geom_line(aes(x=rt, y=int, color=filename)) + theme_bw()

These new files are valid according to the validator provided in MSnbase, which means that most programs should be able to open them, but @@ -500,40 +500,40 @@

Minifying files with RaMS

First, we identify the m/z values we’d like to keep in the minified files. For the demo data, I’ll use the Ingalls Lab list of targeted compounds - those we have authentic standards for.

-
raw_stans <- read.csv(paste0("https://raw.githubusercontent.com/",
-                             "IngallsLabUW/Ingalls_Standards/",
-                             "b098927ea0089b6e7a31e1758e7c7eaad5408535/",
-                             "Ingalls_Lab_Standards_NEW.csv"))
-
-mzs_to_include <- as.numeric(unique(raw_stans[raw_stans$Fraction1=="HILICPos",]$m.z))
-# Include glycine betaine isotopes for README demo
-mzs_to_include <- c(mzs_to_include, 119.0899, 119.0835)
+
raw_stans <- read.csv(paste0("https://raw.githubusercontent.com/",
+                             "IngallsLabUW/Ingalls_Standards/",
+                             "b098927ea0089b6e7a31e1758e7c7eaad5408535/",
+                             "Ingalls_Lab_Standards_NEW.csv"))
+
+mzs_to_include <- as.numeric(unique(raw_stans[raw_stans$Fraction1=="HILICPos",]$m.z))
+# Include glycine betaine isotopes for README demo
+mzs_to_include <- c(mzs_to_include, 119.0899, 119.0835)

Then, we download the raw MS data from the online repository into which it’s been deposited.

-
if(!dir.exists("vignettes/data"))dir.create("vignettes/data")
-base_url <- "ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS703/"
-chosen_files <- paste0(base_url, "170223_Smp_LB12HL_", c("AB", "CD", "EF"), "_pos.mzXML")
-new_names <- gsub(x=basename(chosen_files), "170223_Smp_", "")
-
-mapply(download.file, chosen_files, paste0("vignettes/data/", new_names), 
-       mode = "wb", method = "libcurl")
+
if(!dir.exists("vignettes/data"))dir.create("vignettes/data")
+base_url <- "ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS703/"
+chosen_files <- paste0(base_url, "170223_Smp_LB12HL_", c("AB", "CD", "EF"), "_pos.mzXML")
+new_names <- gsub(x=basename(chosen_files), "170223_Smp_", "")
+
+mapply(download.file, chosen_files, paste0("vignettes/data/", new_names), 
+       mode = "wb", method = "libcurl")

The MSMS data wasn’t uploaded, so we handle that separately by pulling it off the lab computer manually and copying it over to our temporary directory. If you’re following along, you can skip this chunk or use your own DDA data.

-
file.copy(from = paste0("Z:/1_QEdata/2016/2016_Katherine_1335_LightB12_",
-                        "Experiment/170223_KRH_Rerun_1335_LightB12_Exp_HILIC/",
-                        "positive/",
-                        "170223_Poo_AllCyanoAqExtracts_DDApos_2.mzXML"),
-          to = "vignettes/data/DDApos_2.mzXML", overwrite = TRUE)
+
file.copy(from = paste0("Z:/1_QEdata/2016/2016_Katherine_1335_LightB12_",
+                        "Experiment/170223_KRH_Rerun_1335_LightB12_Exp_HILIC/",
+                        "positive/",
+                        "170223_Poo_AllCyanoAqExtracts_DDApos_2.mzXML"),
+          to = "vignettes/data/DDApos_2.mzXML", overwrite = TRUE)

Then we can actually perform the minification:

-
library(RaMS)
-
-if(!dir.exists("inst/extdata"))dir.create("inst/extdata", recursive = TRUE)
-init_files <- list.files("vignettes/data/", full.names = TRUE)
-out_files <- paste0("inst/extdata/", basename(init_files))
-minifyMSdata(files = init_files, output_files = out_files, warn = FALSE,
-             mz_include = mzs_to_include, ppm = 20)
+
library(RaMS)
+
+if(!dir.exists("inst/extdata"))dir.create("inst/extdata", recursive = TRUE)
+init_files <- list.files("vignettes/data/", full.names = TRUE)
+out_files <- paste0("inst/extdata/", basename(init_files))
+minifyMSdata(files = init_files, output_files = out_files, warn = FALSE,
+             mz_include = mzs_to_include, ppm = 20)

Now we have four minified mzXML files in our inst/extdata folder. However, we’d like to be able to demo the mzML functionality as well as that of mzXMLs, so we can use Proteowizard’s @@ -544,28 +544,28 @@

Minifying files with RaMS

time, keeping data between 4 and 15 minutes.

Finally, we gzip the files to get them as small as possible, also using msconvert.

-
system("msconvert inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
-system("msconvert --mzXML inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
-system('msconvert inst/extdata/temp/*.mzML --filter \"scanTime [240,900]\" -o inst/extdata -g')
-system('msconvert inst/extdata/temp/*.mzXML --mzXML --filter \"scanTime [240,900]\" -o inst/extdata -g')
+
system("msconvert inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
+system("msconvert --mzXML inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
+system('msconvert inst/extdata/temp/*.mzML --filter \"scanTime [240,900]\" -o inst/extdata -g')
+system('msconvert inst/extdata/temp/*.mzXML --mzXML --filter \"scanTime [240,900]\" -o inst/extdata -g')

And then for the last few steps, we again rename the files (since msconvert expands them to their full .raw names) and remove the ones we don’t need for the demos.

-
init_files <- list.files("inst/extdata", full.names = TRUE)
-new_names <- paste0("inst/extdata/", gsub(x=init_files, ".*(Smp_|Extracts_)", ""))
-file.rename(init_files, new_names)
-
-unlink("inst/extdata/temp", recursive = TRUE)
-file.remove(list.files("inst/extdata", pattern = "mzXML$", full.names = TRUE))
-file.remove(paste0("inst/extdata/", c("LB12HL_CD.mzXML.gz", "LB12HL_EF.mzXML.gz")))
+
init_files <- list.files("inst/extdata", full.names = TRUE)
+new_names <- paste0("inst/extdata/", gsub(x=init_files, ".*(Smp_|Extracts_)", ""))
+file.rename(init_files, new_names)
+
+unlink("inst/extdata/temp", recursive = TRUE)
+file.remove(list.files("inst/extdata", pattern = "mzXML$", full.names = TRUE))
+file.remove(paste0("inst/extdata/", c("LB12HL_CD.mzXML.gz", "LB12HL_EF.mzXML.gz")))

To check that the new files look ok, we can see if we can read them with RaMS and MSnbase.

-
MSnbase::readMSData(list.files("inst/extdata", full.names = TRUE)[1], msLevel. = 1)
-RaMS::grabMSdata(new_names[1])
+
MSnbase::readMSData(list.files("inst/extdata", full.names = TRUE)[1], msLevel. = 1)
+RaMS::grabMSdata(new_names[1])

Finally, remember to clean up the original downloads folder

-
unlink("vignettes/data", recursive = TRUE)
+
unlink("vignettes/data", recursive = TRUE)

-

README last built on 2022-11-14

+

README last built on 2023-12-13

diff --git a/doc/RaMS-and-friends.R b/doc/RaMS-and-friends.R index 65c1825..a1d47a7 100644 --- a/doc/RaMS-and-friends.R +++ b/doc/RaMS-and-friends.R @@ -1,5 +1,6 @@ ## ----setup, include = FALSE--------------------------------------------------- options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) ## ----------------------------------------------------------------------------- library(RaMS) diff --git a/doc/RaMS-and-friends.Rmd b/doc/RaMS-and-friends.Rmd index c215d02..1ff0206 100644 --- a/doc/RaMS-and-friends.Rmd +++ b/doc/RaMS-and-friends.Rmd @@ -9,6 +9,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) ``` **Table of contents:** diff --git a/doc/RaMS-and-friends.html b/doc/RaMS-and-friends.html index 297677a..beb8057 100644 --- a/doc/RaMS-and-friends.html +++ b/doc/RaMS-and-friends.html @@ -360,26 +360,27 @@

Standard export to CSV

exported to CSV files with base R functions. This works best with a few chromatograms at a time, as the millions of data points found in most MS files can overwhelm common file readers.

-
library(RaMS)
-
-# Locate an MS file
-single_file <- system.file("extdata", "LB12HL_AB.mzML.gz", package = "RaMS")
-
-# Grab the MS data
-msdata <- grabMSdata(single_file, grab_what = "everything")
+
library(RaMS)
+
+# Locate an MS file
+single_file <- system.file("extdata", "LB12HL_AB.mzML.gz", package = "RaMS")
+
+# Grab the MS data
+msdata <- grabMSdata(single_file, grab_what = "everything")
## 
-## Reading file LB12HL_AB.mzML.gz... 0.23 secs 
-## Reading MS1 data...0.14 secs 
-## Reading MS2 data...0.02 secs 
+## Reading file LB12HL_AB.mzML.gz... 0.15 secs 
+## Reading MS1 data...0.11 secs 
+## Reading MS2 data...0.01 secs 
 ## Reading BPC...0.08 secs 
-## Reading TIC...0.07 secs 
-## Reading file metadata...0.14 secs 
-## Total time: 0.7 secs
-
# Write out MS1 data to .csv file
-write.csv(x = msdata$MS1, file = "MS1_data.csv")
-
-# Clean up afterward
-file.remove("MS1_data.csv")
+## Reading TIC...0.06 secs +## Reading file metadata...0.1 secs +## Binding files together into single data.table +## Total time: 0.51 secs +
# Write out MS1 data to .csv file
+write.csv(x = msdata$MS1, file = "MS1_data.csv")
+
+# Clean up afterward
+file.remove("MS1_data.csv")
## [1] TRUE
@@ -391,25 +392,26 @@

Fancier export to Excel

uses the openxlsx package, although there are several alternatives with identical functionality.

-
library(openxlsx)
-
-# Locate an MS2 file
-MS2_file <- system.file("extdata", "DDApos_2.mzML.gz", package = "RaMS")
-
-# Grab the MS1 and MS2 data
-msdata <- grabMSdata(MS2_file, grab_what=c("MS1", "MS2"))
+
library(openxlsx)
+
+# Locate an MS2 file
+MS2_file <- system.file("extdata", "DDApos_2.mzML.gz", package = "RaMS")
+
+# Grab the MS1 and MS2 data
+msdata <- grabMSdata(MS2_file, grab_what=c("MS1", "MS2"))
## 
-## Reading file DDApos_2.mzML.gz... 0.54 secs 
-## Reading MS1 data...0.34 secs 
-## Reading MS2 data...0.12 secs 
-## Total time: 1 secs
-
# Write out MS data to Excel file
-# openxlsx writes each object in a list to a unique sheet
-# Produces one sheet for MS1 and one for MS2
-write.xlsx(msdata, file = "MS2_data.xlsx")
-
-# Clean up afterward
-file.remove("MS2_data.xlsx")
+## Reading file DDApos_2.mzML.gz... 0.41 secs +## Reading MS1 data...0.55 secs +## Reading MS2 data...0.11 secs +## Binding files together into single data.table +## Total time: 1.07 secs +
# Write out MS data to Excel file
+# openxlsx writes each object in a list to a unique sheet
+# Produces one sheet for MS1 and one for MS2
+write.xlsx(msdata, file = "MS2_data.xlsx")
+
+# Clean up afterward
+file.remove("MS2_data.xlsx")
## [1] TRUE
@@ -419,12 +421,12 @@

Exporting to SQL database

This vignette will demo the RSQLite package’s engine, although several other database engines have similar functionality.

-
library(DBI)
-# Get data from multiple files to show off
-mzml_files <- system.file(c("extdata/LB12HL_AB.mzML.gz", 
-                            "extdata/LB12HL_CD.mzML.gz"), 
-                          package = "RaMS")
-msdata <- grabMSdata(mzml_files)
+
library(DBI)
+# Get data from multiple files to show off
+mzml_files <- system.file(c("extdata/LB12HL_AB.mzML.gz", 
+                            "extdata/LB12HL_CD.mzML.gz"), 
+                          package = "RaMS")
+msdata <- grabMSdata(mzml_files)
## 
   |                                                                            
   |                                                                      |   0%
@@ -432,57 +434,58 @@ 

Exporting to SQL database

|=================================== | 50% | |======================================================================| 100% -## Total time: 1.12 secs
-
# Create the sqlite database and connect to it
-MSdb <- dbConnect(RSQLite::SQLite(), "MSdata.sqlite")
-
-# Export MS1 and MS2 data to sqlite tables
-dbWriteTable(MSdb, "MS1", msdata$MS1)
-dbWriteTable(MSdb, "MS2", msdata$MS2)
-dbListTables(MSdb)
+## Total time: 0.98 secs +
# Create the sqlite database and connect to it
+MSdb <- dbConnect(RSQLite::SQLite(), "MSdata.sqlite")
+
+# Export MS1 and MS2 data to sqlite tables
+dbWriteTable(MSdb, "MS1", msdata$MS1)
+dbWriteTable(MSdb, "MS2", msdata$MS2)
+dbListTables(MSdb)
## [1] "MS1" "MS2"
-
# Perform a simple query to ensure data was exported correctly
-dbGetQuery(MSdb, 'SELECT * FROM MS1 LIMIT 3')
+
# Perform a simple query to ensure data was exported correctly
+dbGetQuery(MSdb, 'SELECT * FROM MS1 LIMIT 3')
##      rt       mz        int          filename
 ## 1 4.009 139.0503 1800550.12 LB12HL_AB.mzML.gz
 ## 2 4.009 148.0967  206310.81 LB12HL_AB.mzML.gz
 ## 3 4.009 136.0618   71907.15 LB12HL_AB.mzML.gz
-
# Perform EIC extraction in SQL rather than in R
-EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
-query_params <- list(lower_bound=118.086, upper_bound=118.087)
-EIC <- dbGetQuery(MSdb, EIC_query, params = query_params)
-
-# Append with additional files
-extra_file <- system.file("extdata", "LB12HL_EF.mzML.gz", package = "RaMS")
-extra_msdata <- grabMSdata(extra_file, grab_what = "everything")
+
# Perform EIC extraction in SQL rather than in R
+EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+query_params <- list(lower_bound=118.086, upper_bound=118.087)
+EIC <- dbGetQuery(MSdb, EIC_query, params = query_params)
+
+# Append with additional files
+extra_file <- system.file("extdata", "LB12HL_EF.mzML.gz", package = "RaMS")
+extra_msdata <- grabMSdata(extra_file, grab_what = "everything")
## 
-## Reading file LB12HL_EF.mzML.gz... 0.16 secs 
-## Reading MS1 data...0.15 secs 
+## Reading file LB12HL_EF.mzML.gz... 0.15 secs 
+## Reading MS1 data...0.3 secs 
 ## Reading MS2 data...0.01 secs 
-## Reading BPC...0.07 secs 
-## Reading TIC...0.07 secs 
-## Reading file metadata...0.14 secs 
-## Total time: 0.62 secs
-
dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1') # Initial number of rows
+## Reading BPC...0.06 secs +## Reading TIC...0.06 secs +## Reading file metadata...0.1 secs +## Binding files together into single data.table +## Total time: 0.69 secs +
dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1') # Initial number of rows
##   COUNT(*)
 ## 1    42313
-
dbAppendTable(MSdb, "MS1", extra_msdata$MS1)
+
dbAppendTable(MSdb, "MS1", extra_msdata$MS1)
## [1] 22124
-
# Confirm three different files exist in DB
-dbGetQuery(MSdb, 'SELECT DISTINCT filename FROM MS1')
+
# Confirm three different files exist in DB
+dbGetQuery(MSdb, 'SELECT DISTINCT filename FROM MS1')
##            filename
 ## 1 LB12HL_AB.mzML.gz
 ## 2 LB12HL_CD.mzML.gz
 ## 3 LB12HL_EF.mzML.gz
-
# Confirm new rows have been added
-dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1')
+
# Confirm new rows have been added
+dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1')
##   COUNT(*)
 ## 1    64437
-
# Disconnect after export
-dbDisconnect(MSdb)
-
-# Clean up afterward
-unlink("MSdata.sqlite")
+
# Disconnect after export
+dbDisconnect(MSdb)
+
+# Clean up afterward
+unlink("MSdata.sqlite")

Interfacing with Python via reticulate

@@ -492,11 +495,11 @@

Interfacing with Python via reticulate

Python code chunks as shown below.

R code chunk: {r}

-
# Locate a couple MS files
-data_dir <- system.file("extdata", package = "RaMS")
-file_paths <- list.files(data_dir, pattern = "HL.*mzML", full.names = TRUE)
-
-msdata <- grabMSdata(files = file_paths, grab_what = "BPC")$BPC
+
# Locate a couple MS files
+data_dir <- system.file("extdata", package = "RaMS")
+file_paths <- list.files(data_dir, pattern = "HL.*mzML", full.names = TRUE)
+
+msdata <- grabMSdata(files = file_paths, grab_what = "BPC")$BPC
## 
   |                                                                            
   |                                                                      |   0%
@@ -506,18 +509,18 @@ 

R code chunk: {r}

|=============================================== | 67% | |======================================================================| 100% -## Total time: 0.42 secs
+## Total time: 0.34 secs

Python code chunk: {python}

-
# Not run to pass R CMD check on GitHub
-# Make sure python, matplotlib, and seaborn are installed
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-sns.relplot(data=r.msdata, kind="line", x="rt", y="int", hue="filename")
-plt.show()
+
# Not run to pass R CMD check on GitHub
+# Make sure python, matplotlib, and seaborn are installed
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+sns.relplot(data=r.msdata, kind="line", x="rt", y="int", hue="filename")
+plt.show()
diff --git a/doc/speed_size_comparison.R b/doc/speed_size_comparison.R new file mode 100644 index 0000000..d495564 --- /dev/null +++ b/doc/speed_size_comparison.R @@ -0,0 +1,563 @@ +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set(echo = TRUE, eval=FALSE) +options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) + +## ----------------------------------------------------------------------------- +# library(RaMS) +# library(tidyverse) +# library(microbenchmark) +# library(MSnbase) +# library(Spectra) +# library(DBI) +# library(arrow) +# library(rvest) +# library(xml2) +# +# BiocParallel::register(BiocParallel::SerialParam(stop.on.error = FALSE, progressbar = TRUE)) + +## ----file download------------------------------------------------------------ +# set.seed(123) +# +# n_ms_files <- 10 +# base_url <- "ftp://massive.ucsd.edu/v01/MSV000080030/peak/" %>% +# paste0("Forensic_study_80_volunteers/Forensic_Hands_mzXML/") %>% +# paste0("Forensic_Hands_plate_1_mzXML/Sample/") +# file_list <- base_url %>% +# read_html %>% +# xml_text() %>% +# strsplit("\\\n") %>% +# pluck(1) %>% +# str_remove(".*2016 ") %>% +# str_remove("\\\r") +# chosen_file_list <- sample(file_list, n_ms_files) +# +# dir.create("vignettes/figures/ssc_vignette_renders/Sample/") +# for(i in chosen_file_list){ +# new_file_path <- paste0("vignettes/figures/ssc_vignette_renders/Sample/", i) +# download.file(paste0(base_url, i), destfile = new_file_path, mode = "wb") +# } +# +# ms_files <- list.files("vignettes/figures/ssc_vignette_renders/Sample/", full.names = TRUE) + +## ----MSnExp------------------------------------------------------------------- +# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) +# plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20))) + +## ----onDiskMSnExp------------------------------------------------------------- +# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) +# plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20))) + +## ----Spectra------------------------------------------------------------------ +# getIntensities <- function(x, ...) { +# if (nrow(x)) { +# cbind(mz = NA_real_, intensity = x[, "intensity"]) +# } else cbind(mz = NA_real_, intensity = NA_real_) +# } +# +# sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>% +# filterMsLevel(1) %>% +# filterMzRange(pmppm(432.2810, ppm = 20)) +# sfs_agg <- addProcessing(sfs_filtered, getIntensities) +# eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE)) +# plot(eic[,"rt"], eic[,"int"], type="l") + +## ----RaMS--------------------------------------------------------------------- +# rams_obj <- grabMSdata(ms_files, grab_what="MS1") +# rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] +# plot(rams_chrom_data$rt, rams_chrom_data$int, type="l") + +## ----tmzML-------------------------------------------------------------------- +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 3, progressbar = TRUE, tasks=length(tmzml_names))) +# tmzml_obj <- grabMSdata(tmzml_names) +# tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] +# plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l") + +## ----arrow-------------------------------------------------------------------- +# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") +# arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>% +# filter(mz%between%pmppm(432.2810, ppm = 20)) %>% +# dplyr::collect() +# plot(arrow_data$rt, arrow_data$int, type="l") + +## ----sqlite database---------------------------------------------------------- +# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) +# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' +# query_params <- as.list(pmppm(432.2810, ppm = 20)) +# names(query_params) <- c("lower_bound", "upper_bound") +# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) +# plot(sql_data$rt, sql_data$int, type="l") +# +# rs <- dbSendQuery(MSdb, "CREATE INDEX mz ON MS1 (mz)") +# dbClearResult(rs) +# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) +# sql_data <- sql_data[order(sql_data$filename, sql_data$rt),] +# plot(sql_data$rt, sql_data$int, type="l") +# dbDisconnect(MSdb) + +## ----time2make---------------------------------------------------------------- +# msnexp_make_fun <- function(){ +# readMSData(ms_files, mode="inMemory", msLevel. = 1) +# } +# ondisk_make_fun <- function(){ +# readMSData(ms_files, mode="onDisk", msLevel. = 1) +# } +# spectra_make_fun <- function(){ +# Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1) +# } +# rams_make_fun <- function(){ +# grabMSdata(ms_files, grab_what="MS1") +# } +# tmzml_make_fun <- function(){ +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", +# gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# mapply(tmzmlMaker, ms_files, tmzml_names) +# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) +# } +# arrow_make_fun <- function(){ +# msdata <- grabMSdata(ms_files, grab_what="MS1") +# write_dataset(msdata$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") +# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) +# } +# sql_make_fun <- function(){ +# msdata <- grabMSdata(ms_files, grab_what="MS1") +# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# dbWriteTable(MSdb, "MS1", msdata$MS1, overwrite=TRUE) +# dbDisconnect(MSdb) +# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# } +# sqlidx_make_fun <- function(){ +# msdata <- grabMSdata(ms_files, grab_what="MS1") +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# dbWriteTable(MSdb_idx, "MS1", msdata$MS1, overwrite=TRUE) +# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)") +# dbClearResult(rs) +# dbDisconnect(MSdb_idx) +# unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# } +# +# make_timings <- microbenchmark( +# msnexp_make_fun(), ondisk_make_fun(), spectra_make_fun(), rams_make_fun(), +# tmzml_make_fun(), arrow_make_fun(), sql_make_fun(), sqlidx_make_fun(), +# times = 10) +# saveRDS(make_timings, "vignettes/figures/ssc_vignette_renders/make_timings.rds") + +## ----plot time2make----------------------------------------------------------- +# make_timings <- readRDS("vignettes/figures/ssc_vignette_renders/make_timings.rds") +# make_timings %>% +# as.data.frame() %>% +# arrange(expr) %>% +# mutate(expr=str_remove(expr, "_make_fun\\(\\)")) %>% +# mutate(rep_type=case_when( +# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session", +# TRUE~"Single-time only" +# )) %>% +# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", +# "tmzml", "arrow", "sql", "sqlidx"), +# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", +# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>% +# ggplot() + +# geom_boxplot(aes(x=expr, y=time/1e9)) + +# geom_hline(yintercept = 0) + +# facet_wrap(~rep_type, nrow = 1, scales="free_x") + +# labs(y="Seconds", x=NULL) + +# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +# ggsave("vignettes/figures/ssc_vignette_renders/make_time_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) + +## ----parallel_proc------------------------------------------------------------ +# unpar_rams <- function(){ +# print("Unpar RaMS") +# lapply(ms_files, grabMSdata) +# } +# unpar_tmzml <- function(){ +# print("Unpar tmzML") +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", +# gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# pbapply::pbmapply(tmzmlMaker, ms_files, tmzml_names) +# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) +# } +# +# library(BiocParallel) +# par_param <- SnowParam(workers = 5, progressbar = TRUE) +# par_rams <- function(){ +# print("Par RaMS") +# bplapply(ms_files, grabMSdata, BPPARAM = par_param) +# } +# par_tmzml <- function(){ +# print("Par tmzML") +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", +# gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = par_param) +# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) +# } +# +# par_timings <- microbenchmark(par_rams(), unpar_rams(), par_tmzml(), unpar_tmzml(), times = 5) +# saveRDS(par_timings, "vignettes/figures/ssc_vignette_renders/par_timings.rds") + +## ----parallel proc plot------------------------------------------------------- +# par_timings <- readRDS("vignettes/figures/ssc_vignette_renders/par_timings.rds") +# par_timings %>% +# as.data.frame() %>% +# separate(expr, into = c("sub_type", "par_type"), sep = "_") %>% +# mutate(par_type=str_remove(par_type, "\\(\\)")) %>% +# mutate(sub_type=factor(sub_type, levels=c("unpar", "par"), +# labels=c("Sequential", "Parallel"))) %>% +# mutate(par_type=factor(par_type, levels=c("rams", "tmzml"), +# labels=c("RaMS", "tmzMLs"))) %>% +# ggplot() + +# geom_boxplot(aes(x=par_type, y=time/1e9)) + +# geom_hline(yintercept = 0) + +# facet_wrap(~sub_type, nrow = 1) + +# labs(y="Seconds", x=NULL) + +# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +# ggsave("vignettes/figures/ssc_vignette_renders/par_time_gp.png", width = 6.5, height = 3, units = "in", device = "png", dpi = 144) + +## ----time2query--------------------------------------------------------------- +# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) +# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) +# spectra_obj <- Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1) +# rams_obj <- grabMSdata(ms_files, grab_what="MS1") +# +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", +# gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# mapply(tmzmlMaker, ms_files, tmzml_names) +# +# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") +# +# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) +# dbDisconnect(MSdb) +# +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# dbWriteTable(MSdb_idx, "MS1", rams_obj$MS1, overwrite=TRUE) +# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)") +# dbClearResult(rs) +# dbDisconnect(MSdb_idx) +# +# msnexp_query_fun <- function(){ +# plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20))) +# } +# ondisk_query_fun <- function(){ +# plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20))) +# } +# spectra_query_fun <- function(){ +# sfs_filtered <- spectra_obj %>% filterMzRange(pmppm(432.2810, ppm = 20)) +# getIntensities <- function(x, ...) { +# if (nrow(x)) { +# cbind(mz = NA_real_, intensity = x[, "intensity"]) +# } else cbind(mz = NA_real_, intensity = NA_real_) +# } +# sfs_agg <- addProcessing(sfs_filtered, getIntensities) +# eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE)) +# plot(eic[,"rt"], eic[,"int"], type="l") +# } +# rams_query_fun <- function(){ +# rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] +# plot(rams_chrom_data$rt, rams_chrom_data$int, type="l") +# } +# tmzml_query_fun <- function(){ +# tmzml_names <- list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE) +# tmzml_obj <- grabMSdata(tmzml_names) +# tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] +# plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l") +# } +# arrow_query_fun <- function(){ +# arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>% +# filter(mz%between%pmppm(432.2810, ppm = 20)) %>% +# dplyr::collect() +# plot(arrow_data$rt, arrow_data$int, type="l") +# } +# sql_query_fun <- function(){ +# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' +# query_params <- as.list(pmppm(432.2810, ppm = 20)) +# names(query_params) <- c("lower_bound", "upper_bound") +# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) +# plot(sql_data$rt, sql_data$int, type="l") +# } +# sqlidx_query_fun <- function(){ +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' +# query_params <- as.list(pmppm(432.2810, ppm = 20)) +# names(query_params) <- c("lower_bound", "upper_bound") +# sql_data <- dbGetQuery(MSdb_idx, EIC_query, params = query_params) +# sql_data <- sql_data[order(sql_data$filename, sql_data$rt),] +# plot(sql_data$rt, sql_data$int, type="l") +# } +# +# query_timings <- microbenchmark( +# msnexp_query_fun(), ondisk_query_fun(), spectra_query_fun(), rams_query_fun(), +# tmzml_query_fun(), arrow_query_fun(), sql_query_fun(), sqlidx_query_fun(), +# times = 10 +# ) +# query_timings +# saveRDS(query_timings, "vignettes/figures/ssc_vignette_renders/query_timings.rds") + +## ----time2query plot---------------------------------------------------------- +# query_timings <- readRDS("vignettes/figures/ssc_vignette_renders/query_timings.rds") +# query_timings %>% +# as.data.frame() %>% +# arrange(expr) %>% +# mutate(expr=str_remove(expr, "_query_fun\\(\\)")) %>% +# mutate(rep_type=case_when( +# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session", +# TRUE~"Single-time only" +# )) %>% +# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", +# "tmzml", "arrow", "sql", "sqlidx"), +# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", +# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>% +# ggplot() + +# geom_boxplot(aes(x=expr, y=time/1e9)) + +# scale_y_log10() + +# labs(y="Seconds", x=NULL) + +# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +# ggsave("vignettes/figures/ssc_vignette_renders/query_time_gp.png", width = 6.5, height = 5, units = "in", device = "png", dpi = 144) + +## ----double-check sql_idx vs RaMS--------------------------------------------- +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' +# query_params <- as.list(pmppm(432.2810, ppm = 20)) +# names(query_params) <- c("lower_bound", "upper_bound") +# dbGetQuery(MSdb_idx, EIC_query, params = query_params) %>% qplotMS1data() +# +# rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] %>% qplotMS1data() + +## ----make query strings------------------------------------------------------- +# rams_obj <- grabMSdata(ms_files, grab_what="MS1") +# grouped_ms1 <- rams_obj$MS1 %>% +# arrange(desc(int)) %>% +# mutate(mz_group=mz_group(mz, ppm = 10, max_groups = 10, min_group_size=20)) %>% +# drop_na() +# # grouped_ms1 %>% +# # qplotMS1data(facet_col="mz_group", facet_args = list(ncol=2)) +# mzs_to_grab <- grouped_ms1 %>% +# group_by(mz_group) %>% +# summarise(mean_mz=mean(mz), sd_mz=sd(mz), mean_rt=mean(rt)) %>% +# pull(mean_mz) +# +# rams_arrow_call <- lapply(mzs_to_grab, function(mz_i){ +# mzrange <- pmppm(mz_i, 10) +# call("between", as.name("mz"), mzrange[[1]], mzrange[[2]]) +# }) %>% paste(collapse = "|") +# +# sql_comb_call <- sapply(mzs_to_grab, function(mz_i){ +# paste("mz BETWEEN", pmppm(mz_i, 10)[1], "AND", pmppm(mz_i, 10)[2]) +# }) %>% paste(collapse = " OR ") %>% paste("SELECT * FROM MS1 WHERE", .) +# +# print(rams_arrow_call) +# print(sql_comb_call) + +## ----multichrom query timing-------------------------------------------------- +# rams_uni_fun <- function(){ +# print("RaMS unified") +# rams_obj$MS1[eval(parse(text=rams_arrow_call))] +# } +# rams_loop_fun <- function(){ +# print("RaMS loop") +# lapply(mzs_to_grab, function(mz_i){ +# rams_obj$MS1[mz%between%pmppm(mz_i, 10)] +# }) %>% bind_rows() %>% distinct() +# } +# +# arrow_ds <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") +# arrow_uni_fun <- function(){ +# print("Arrow unified") +# arrow_ds %>% +# filter(eval(parse(text = rams_arrow_call))) %>% +# collect() +# } +# arrow_loop_fun <- function(){ +# print("Arrow loop") +# lapply(mzs_to_grab, function(mz_i){ +# arrow_ds %>% +# filter(mz%between%pmppm(mz_i, 10)) %>% +# collect() +# }) %>% bind_rows() %>% distinct() +# } +# +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# sql_uni_fun <- function(){ +# print("SQL unified") +# dbGetQuery(MSdb_idx, sql_comb_call) +# } +# sql_query_base <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' +# sql_loop_fun <- function(){ +# print("SQL loop") +# lapply(mzs_to_grab, function(mz_i){ +# query_params <- as.list(pmppm(mz_i, ppm = 20)) +# names(query_params) <- c("lower_bound", "upper_bound") +# sql_data <- dbGetQuery(MSdb_idx, sql_query_base, params = query_params) +# }) %>% bind_rows() %>% distinct() +# } +# +# multichrom_timings <- microbenchmark( +# rams_uni_fun(), rams_loop_fun(), arrow_uni_fun(), arrow_loop_fun(), +# sql_uni_fun(), sql_loop_fun(), times = 10 +# ) +# saveRDS(multichrom_timings, "vignettes/figures/ssc_vignette_renders/multichrom_timings.rds") +# +# multichrom_timings <- readRDS("vignettes/figures/ssc_vignette_renders/multichrom_timings.rds") +# multichrom_timings %>% +# as.data.frame() %>% +# arrange(expr) %>% +# mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% +# separate(expr, into = c("expr", "query_type"), sep = "_") %>% +# mutate(expr=factor(expr, levels=c("rams", "arrow", "sql"), +# labels=c("RaMS", "Arrow", "SQL"))) %>% +# mutate(query_type=factor(query_type, levels=c("uni", "loop"), +# labels=c("Unified query", "Query loop"))) %>% +# ggplot() + +# geom_boxplot(aes(x=query_type, y=time/1e9), lwd=1) + +# facet_wrap(~expr, nrow=1) + +# scale_y_log10() + +# labs(y="Seconds", x=NULL, color=NULL) + +# theme_bw() +# ggsave("vignettes/figures/ssc_vignette_renders/multichrom_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) + +## ----sizing info-------------------------------------------------------------- +# size_list <- list() +# +# size_list$mzXML <- sum(file.size(ms_files)) +# +# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) +# size_list$msnexp_obj <- pryr::object_size(msnexp_obj) +# rm(msnexp_obj) +# +# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) +# size_list$ondisk_obj <- pryr::object_size(ondisk_obj) +# rm(ondisk_obj) +# +# sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>% +# filterMsLevel(1) +# size_list$spectra <- pryr::object_size(sfs_filtered) +# rm(sfs_filtered) +# +# rams_obj <- grabMSdata(ms_files, grab_what="MS1") +# size_list$rams <- pryr::object_size(rams_obj) +# +# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files))) +# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") +# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 5, progressbar = TRUE, tasks=length(tmzml_names))) +# size_list$tmzml <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE))) +# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) +# +# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") +# size_list$arrow <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/pqds", full.names = TRUE))) +# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) +# +# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) +# dbDisconnect(MSdb) +# size_list$MSdb <- file.size("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# +# +# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)") +# dbClearResult(rs) +# dbDisconnect(MSdb_idx) +# size_list$MSdb_idx <- file.size("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# +# saveRDS(size_list, "vignettes/figures/ssc_vignette_renders/size_list.rds") + +## ----plot size info----------------------------------------------------------- +# size_list <- readRDS("vignettes/figures/ssc_vignette_renders/size_list.rds") +# size_list %>% +# within(rm(mzXML)) %>% +# sapply(as.numeric) %>% +# data.frame(bytes=.) %>% +# rownames_to_column("expr") %>% +# mutate(expr=str_remove(expr, "_obj")) %>% +# mutate(expr=str_replace(expr, "MSdb_?", "sql")) %>% +# mutate(mem_type=case_when( +# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", +# TRUE~"Disk" +# )) %>% +# mutate(mem_type=factor(mem_type, levels=c("Memory", "Disk"))) %>% +# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", +# "tmzml", "arrow", "sql", "sqlidx"), +# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", +# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>% +# ggplot() + +# geom_hline(yintercept = size_list$mzXML/(1024^3)) + +# geom_point(aes(x=expr, y=bytes/(1024^3))) + +# scale_y_log10(breaks=c(0.001, 0.01, 0.1, 1, 10), labels=c("1MB", "10MB", "100MB", "1GB", "10GB"), +# limits=c(0.001, 10)) + +# facet_wrap(~mem_type, scales = "free_x") + +# labs(x=NULL, y=NULL) +# ggsave("vignettes/figures/ssc_vignette_renders/size_cons.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) + +## ----summary plot------------------------------------------------------------- +# make_timings <- readRDS("vignettes/figures/ssc_vignette_renders/make_timings.rds") +# query_timings <- readRDS("vignettes/figures/ssc_vignette_renders/query_timings.rds") +# size_df <- readRDS("vignettes/figures/ssc_vignette_renders/size_list.rds") %>% +# sapply(as.numeric) %>% +# data.frame(size=.) %>% +# mutate(size=size/1024^3) %>% +# rownames_to_column("expr") %>% +# mutate(expr=str_remove(expr, "_obj")) %>% +# mutate(expr=str_replace(expr, "MSdb_?", "sql")) +# +# bind_rows(make_timings, query_timings) %>% +# as.data.frame() %>% +# group_by(expr) %>% +# mutate(time=time/1e9) %>% +# summarise(med_time=median(time), IQR_time=IQR(time)) %>% +# mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% +# separate(expr, into = c("expr", "time_type")) %>% +# left_join(size_df) %>% +# mutate(rep_type=case_when( +# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", +# TRUE~"Disk" +# )) %>% +# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", +# "tmzml", "arrow", "sql", "sqlidx"), +# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", +# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>% +# pivot_wider(names_from = time_type, values_from=c("med_time", "IQR_time")) %>% +# ggplot() + +# geom_vline(xintercept = 0) + +# geom_linerange(aes(x=med_time_make, ymin=med_time_query-IQR_time_query*2, +# ymax=med_time_query+IQR_time_query*2, color=expr)) + +# geom_linerange(aes(y=med_time_query, xmin=med_time_make-IQR_time_make*2, +# xmax=med_time_make+IQR_time_make*2, color=expr)) + +# geom_point(aes(x=med_time_make, y=med_time_query, color=expr, +# size=size, shape=rep_type)) + +# scale_shape_manual(values=c(16, 15)) + +# scale_y_log10() + +# coord_flip() + +# guides(color = guide_legend(order = 1), shape = guide_legend(order = 2), +# size=guide_legend(order=3)) + +# labs(x="Time to transform (s)", y="Time to query (s)", color=NULL, size="Size (GB)", +# shape="Storage") + +# theme_bw() +# ggsave("vignettes/figures/ssc_vignette_renders/sum_plot.png", width = 6.5, height = 5, units = "in", device = "png", dpi = 144) +# +# +# # bind_rows(make_timings, query_timings) %>% +# # as.data.frame() %>% +# # group_by(expr) %>% +# # mutate(time=time/1e9) %>% +# # summarise(med_time=median(time)) %>% +# # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% +# # separate(expr, into = c("expr", "time_type")) %>% +# # pivot_wider(names_from = time_type, values_from=med_time) %>% +# # left_join(size_df) %>% +# # plotly::plot_ly(x=~make, y=~query, z=~size, hovertext=~expr, +# # type="scatter3d", mode="markers") + +## ----cleanup------------------------------------------------------------------ +# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) +# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) +# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") +# unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") +# unlink("vignettes/figures/ssc_vignette_renders/Sample/", recursive = TRUE) + diff --git a/doc/speed_size_comparison.Rmd b/doc/speed_size_comparison.Rmd index 7318949..8c52b83 100644 --- a/doc/speed_size_comparison.Rmd +++ b/doc/speed_size_comparison.Rmd @@ -14,6 +14,7 @@ vignette: > ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, eval=FALSE) options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) ``` ```{r} diff --git a/doc/speed_size_comparison.html b/doc/speed_size_comparison.html index a393201..0d7ffd6 100644 --- a/doc/speed_size_comparison.html +++ b/doc/speed_size_comparison.html @@ -12,7 +12,7 @@ - + Benchmarking - RaMS, MSnBase, Spectra, Arrow, and SQL @@ -351,7 +351,7 @@

Benchmarking - RaMS, MSnBase, Spectra, Arrow, and SQL

William Kumler

-

2023-11-29

+

2023-12-13

diff --git a/vignettes/Intro-to-RaMS.Rmd b/vignettes/Intro-to-RaMS.Rmd index 7d411eb..43afd02 100644 --- a/vignettes/Intro-to-RaMS.Rmd +++ b/vignettes/Intro-to-RaMS.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/vignettes/Intro-to-tmzML.Rmd b/vignettes/Intro-to-tmzML.Rmd index c7d2ae9..e79b9f8 100644 --- a/vignettes/Intro-to-tmzML.Rmd +++ b/vignettes/Intro-to-tmzML.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/vignettes/Minifying-files-with-RaMS.Rmd b/vignettes/Minifying-files-with-RaMS.Rmd index 6ddbe20..f2e9ea4 100644 --- a/vignettes/Minifying-files-with-RaMS.Rmd +++ b/vignettes/Minifying-files-with-RaMS.Rmd @@ -12,6 +12,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) options(tidyverse.quiet = TRUE) +data.table::setDTthreads(2) knitr::opts_chunk$set( collapse = TRUE, diff --git a/vignettes/RaMS-and-friends.Rmd b/vignettes/RaMS-and-friends.Rmd index c215d02..1ff0206 100644 --- a/vignettes/RaMS-and-friends.Rmd +++ b/vignettes/RaMS-and-friends.Rmd @@ -9,6 +9,7 @@ vignette: > ```{r setup, include = FALSE} options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) ``` **Table of contents:** diff --git a/vignettes/speed_size_comparison.Rmd b/vignettes/speed_size_comparison.Rmd index 7318949..8c52b83 100644 --- a/vignettes/speed_size_comparison.Rmd +++ b/vignettes/speed_size_comparison.Rmd @@ -14,6 +14,7 @@ vignette: > ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, eval=FALSE) options(rmarkdown.html_vignette.check_title = FALSE) +data.table::setDTthreads(2) ``` ```{r}