@@ -926,7 +926,7 @@ Saving space: EICs and rtrange
|==================================================== | 75%
|
|======================================================================| 100%
-#> Total time: 1.57 secs
+#> Total time: 1.72 secs
mzs_of_interest <- c(adenine=136.06232, valine=118.0865)
small_data <- grabMSdata(data_files, grab_what = c("EIC", "EIC_MS2"),
@@ -942,7 +942,7 @@ Saving space: EICs and rtrange
|==================================================== | 75%
|
|======================================================================| 100%
-#> Total time: 1.52 secs
+#> Total time: 1.63 secs
all_data$MS1 %>%
mutate(type="All data") %>%
@@ -986,7 +986,7 @@ Saving space: EICs and rtrange
|==================================================== | 75%
|
|======================================================================| 100%
-#> Total time: 1.49 secs
+#> Total time: 1.28 secs
all_data$MS1 %>%
mutate(type="All data") %>%
@@ -1030,26 +1030,26 @@ Speeding things up
#>
|
|================== | 25%
-#> Reading file LB12HL_AB.mzML.gz... 0.12 secs
-#> Reading MS1 data...0.1 secs
+#> Reading file LB12HL_AB.mzML.gz... 0.15 secs
+#> Reading MS1 data...0.12 secs
#> Reading MS2 data...0.01 secs
#>
|
|=================================== | 50%
-#> Reading file LB12HL_CD.mzML.gz... 0.12 secs
-#> Reading MS1 data...0.1 secs
+#> Reading file LB12HL_CD.mzML.gz... 0.15 secs
+#> Reading MS1 data...0.12 secs
#> Reading MS2 data...0.01 secs
#>
|
|==================================================== | 75%
-#> Reading file LB12HL_EF.mzML.gz... 0.12 secs
-#> Reading MS1 data...0.1 secs
+#> Reading file LB12HL_EF.mzML.gz... 0.13 secs
+#> Reading MS1 data...0.11 secs
#> Reading MS2 data...0.01 secs
#>
|
|======================================================================| 100%
#> Binding files together into single data.table
-#> Total time: 1.55 secs
+#> Total time: 1.65 secs
In general, slow file read times can be improved by compressing the
data. mzML files are highly compressible, with options to compress both
the data itself using the zlib
method and the files as a
@@ -1160,7 +1160,7 @@
The nitty-gritty details
For more information about the mzML data format and its history,
check out the specification at https://www.psidev.info/mzML.
-Vignette last built on 2023-11-29
+Vignette last built on 2023-12-13
diff --git a/doc/Intro-to-tmzML.R b/doc/Intro-to-tmzML.R
index e3a3bbf..9f1deed 100644
--- a/doc/Intro-to-tmzML.R
+++ b/doc/Intro-to-tmzML.R
@@ -1,6 +1,7 @@
## ----setup, include = FALSE---------------------------------------------------
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/doc/Intro-to-tmzML.Rmd b/doc/Intro-to-tmzML.Rmd
index c7d2ae9..e79b9f8 100644
--- a/doc/Intro-to-tmzML.Rmd
+++ b/doc/Intro-to-tmzML.Rmd
@@ -12,6 +12,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/doc/Intro-to-tmzML.html b/doc/Intro-to-tmzML.html
index 47164cc..ff60389 100644
--- a/doc/Intro-to-tmzML.html
+++ b/doc/Intro-to-tmzML.html
@@ -462,27 +462,16 @@ Getting started with tmzML documents
ggplot(ms_data_table) + geom_line(aes(x=rt, y=int, color=filename)) + xlim(8, 9.5)
library(dplyr)
-#>
-#> Attaching package: 'dplyr'
-#> The following object is masked from 'package:RaMS':
-#>
-#> between
-#> The following objects are masked from 'package:stats':
-#>
-#> filter, lag
-#> The following objects are masked from 'package:base':
-#>
-#> intersect, setdiff, setequal, union
-ms_data_table %>%
- filter(rt%between%c(8.4, 8.9)) %>%
- group_by(filename) %>%
- summarise(area=sum(int))
-#> # A tibble: 3 × 2
-#> filename area
-#> <chr> <dbl>
-#> 1 LB12HL_AB.tmzML 13057831.
-#> 2 LB12HL_CD.tmzML 24925878.
-#> 3 LB12HL_EF.tmzML 21913457.
+ms_data_table %>%
+ filter(rt%between%c(8.4, 8.9)) %>%
+ group_by(filename) %>%
+ summarise(area=sum(int))
+#> # A tibble: 3 × 2
+#> filename area
+#> <chr> <dbl>
+#> 1 LB12HL_AB.tmzML 13057831.
+#> 2 LB12HL_CD.tmzML 24925878.
+#> 3 LB12HL_EF.tmzML 21913457.
Why not tmzML?
@@ -509,7 +498,7 @@
Why not tmzML?
#> $ MS2 : NULL
#> $ connection:List of 3
#> ..$ files : Named chr [1:3] "tmzMLs/LB12HL_AB.tmzML" "tmzMLs/LB12HL_CD.tmzML" "tmzMLs/LB12HL_EF.tmzML"
-
#> .. ..- attr(*, "names")= chr [1:3] "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_AB.mzML.gz" "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_CD.mzML.gz" "C:/Users/willi/AppData/Local/R/win-library/4.3/RaMS/extdata/LB12HL_EF.mzML.gz"
+
#> .. ..- attr(*, "names")= chr [1:3] "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_AB.mzML.gz" "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_CD.mzML.gz" "C:/Users/willi/AppData/Local/Temp/RtmpQ9wT8y/temp_libpath49a055ab1e9b/RaMS/extdata/LB12HL_EF.mzML.gz"
#> ..$ grab_what: chr [1:2] "MS1" "MS2"
#> ..$ verbosity: num 0
#> - attr(*, "class")= chr "msdata_connection"
@@ -573,7 +562,7 @@ tmzML internals
fast so we may be approaching diminishing returns.
unlink("tmzMLs", recursive = TRUE)
-Vignette last built on 2023-11-29
+Vignette last built on 2023-12-13
diff --git a/doc/Minifying-files-with-RaMS.R b/doc/Minifying-files-with-RaMS.R
index 7879811..9fe91e8 100644
--- a/doc/Minifying-files-with-RaMS.R
+++ b/doc/Minifying-files-with-RaMS.R
@@ -1,6 +1,7 @@
## ----setup, include = FALSE---------------------------------------------------
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/doc/Minifying-files-with-RaMS.Rmd b/doc/Minifying-files-with-RaMS.Rmd
index 6ddbe20..f2e9ea4 100644
--- a/doc/Minifying-files-with-RaMS.Rmd
+++ b/doc/Minifying-files-with-RaMS.Rmd
@@ -12,6 +12,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/doc/Minifying-files-with-RaMS.html b/doc/Minifying-files-with-RaMS.html
index 0151a94..33594cd 100644
--- a/doc/Minifying-files-with-RaMS.html
+++ b/doc/Minifying-files-with-RaMS.html
@@ -365,23 +365,23 @@ Minifying files with RaMS
Below, we begin with a large MS file containing both MS1
and MS2 data and extract only the data corresponding to
valine/glycine betaine and homarine.
-library(RaMS)
-msdata_files <- list.files(
- system.file("extdata", package = "RaMS"), full.names = TRUE, pattern = "mzML"
-)[1:4]
-
-initial_filename <- msdata_files[1]
-output_filename <- gsub(x=paste0("minified_", basename(initial_filename)), "\\.gz", "")
-
-masses_of_interest <- c(118.0865, 138.0555)
-minifyMSdata(files = initial_filename, output_files = output_filename,
- mz_include = masses_of_interest, ppm = 10, warn = FALSE)
+library(RaMS)
+msdata_files <- list.files(
+ system.file("extdata", package = "RaMS"), full.names = TRUE, pattern = "mzML"
+)[1:4]
+
+initial_filename <- msdata_files[1]
+output_filename <- gsub(x=paste0("minified_", basename(initial_filename)), "\\.gz", "")
+
+masses_of_interest <- c(118.0865, 138.0555)
+minifyMSdata(files = initial_filename, output_files = output_filename,
+ mz_include = masses_of_interest, ppm = 10, warn = FALSE)
Then, when we open the file up (with RaMS
or other
software) we are left with the data corresponding only to those
compounds:
-init_msdata <- grabMSdata(initial_filename)
-msdata <- grabMSdata(output_filename)
-knitr::kable(head(msdata$MS1, 3))
+init_msdata <- grabMSdata(initial_filename)
+msdata <- grabMSdata(output_filename)
+knitr::kable(head(msdata$MS1, 3))
-knitr::kable(head(msdata$MS2, 3))
+knitr::kable(head(msdata$MS2, 3))
@@ -461,23 +461,23 @@ Minifying files with RaMS
Both the TIC and BPC are updated to reflect the smaller file size as
well:
-par(mfrow=c(2, 1), mar=c(2.1, 2.1, 1.1, 0.1))
-plot(init_msdata$BPC$rt, init_msdata$BPC$int, type = "l", main = "Initial BPC")
-plot(msdata$BPC$rt, msdata$BPC$int, type = "l", main = "New BPC")
+par(mfrow=c(2, 1), mar=c(2.1, 2.1, 1.1, 0.1))
+plot(init_msdata$BPC$rt, init_msdata$BPC$int, type = "l", main = "Initial BPC")
+plot(msdata$BPC$rt, msdata$BPC$int, type = "l", main = "New BPC")
The minifyMSdata
function is vectorized so the exact
same syntax can be used for multiple files:
-dir.create("mini_mzMLs/")
-output_files <- paste0("mini_mzMLs/", basename(msdata_files))
-output_files <- gsub(x=output_files, "\\.gz", "")
-
-minifyMSdata(files = msdata_files, output_files = output_files, verbosity = 0,
- mz_include = masses_of_interest, ppm = 10, warn = FALSE)
-mini_msdata <- grabMSdata(output_files, verbosity = 0)
-
-library(ggplot2)
-ggplot(mini_msdata$BPC) + geom_line(aes(x=rt, y=int, color=filename)) + theme_bw()
+dir.create("mini_mzMLs/")
+output_files <- paste0("mini_mzMLs/", basename(msdata_files))
+output_files <- gsub(x=output_files, "\\.gz", "")
+
+minifyMSdata(files = msdata_files, output_files = output_files, verbosity = 0,
+ mz_include = masses_of_interest, ppm = 10, warn = FALSE)
+mini_msdata <- grabMSdata(output_files, verbosity = 0)
+
+library(ggplot2)
+ggplot(mini_msdata$BPC) + geom_line(aes(x=rt, y=int, color=filename)) + theme_bw()
These new files are valid according to the validator provided in
MSnbase, which means that most programs should be able to open them, but
@@ -500,40 +500,40 @@
Minifying files with RaMS
First, we identify the m/z values we’d like to keep in the
minified files. For the demo data, I’ll use the Ingalls Lab list of
targeted compounds - those we have authentic standards for.
-raw_stans <- read.csv(paste0("https://raw.githubusercontent.com/",
- "IngallsLabUW/Ingalls_Standards/",
- "b098927ea0089b6e7a31e1758e7c7eaad5408535/",
- "Ingalls_Lab_Standards_NEW.csv"))
-
-mzs_to_include <- as.numeric(unique(raw_stans[raw_stans$Fraction1=="HILICPos",]$m.z))
-# Include glycine betaine isotopes for README demo
-mzs_to_include <- c(mzs_to_include, 119.0899, 119.0835)
+raw_stans <- read.csv(paste0("https://raw.githubusercontent.com/",
+ "IngallsLabUW/Ingalls_Standards/",
+ "b098927ea0089b6e7a31e1758e7c7eaad5408535/",
+ "Ingalls_Lab_Standards_NEW.csv"))
+
+mzs_to_include <- as.numeric(unique(raw_stans[raw_stans$Fraction1=="HILICPos",]$m.z))
+# Include glycine betaine isotopes for README demo
+mzs_to_include <- c(mzs_to_include, 119.0899, 119.0835)
Then, we download the raw MS data from the online repository into
which it’s been deposited.
-if(!dir.exists("vignettes/data"))dir.create("vignettes/data")
-base_url <- "ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS703/"
-chosen_files <- paste0(base_url, "170223_Smp_LB12HL_", c("AB", "CD", "EF"), "_pos.mzXML")
-new_names <- gsub(x=basename(chosen_files), "170223_Smp_", "")
-
-mapply(download.file, chosen_files, paste0("vignettes/data/", new_names),
- mode = "wb", method = "libcurl")
+if(!dir.exists("vignettes/data"))dir.create("vignettes/data")
+base_url <- "ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS703/"
+chosen_files <- paste0(base_url, "170223_Smp_LB12HL_", c("AB", "CD", "EF"), "_pos.mzXML")
+new_names <- gsub(x=basename(chosen_files), "170223_Smp_", "")
+
+mapply(download.file, chosen_files, paste0("vignettes/data/", new_names),
+ mode = "wb", method = "libcurl")
The MSMS data wasn’t uploaded, so we handle that separately by
pulling it off the lab computer manually and copying it over to our
temporary directory. If you’re following along, you can skip this chunk
or use your own DDA data.
-file.copy(from = paste0("Z:/1_QEdata/2016/2016_Katherine_1335_LightB12_",
- "Experiment/170223_KRH_Rerun_1335_LightB12_Exp_HILIC/",
- "positive/",
- "170223_Poo_AllCyanoAqExtracts_DDApos_2.mzXML"),
- to = "vignettes/data/DDApos_2.mzXML", overwrite = TRUE)
+file.copy(from = paste0("Z:/1_QEdata/2016/2016_Katherine_1335_LightB12_",
+ "Experiment/170223_KRH_Rerun_1335_LightB12_Exp_HILIC/",
+ "positive/",
+ "170223_Poo_AllCyanoAqExtracts_DDApos_2.mzXML"),
+ to = "vignettes/data/DDApos_2.mzXML", overwrite = TRUE)
Then we can actually perform the minification:
-library(RaMS)
-
-if(!dir.exists("inst/extdata"))dir.create("inst/extdata", recursive = TRUE)
-init_files <- list.files("vignettes/data/", full.names = TRUE)
-out_files <- paste0("inst/extdata/", basename(init_files))
-minifyMSdata(files = init_files, output_files = out_files, warn = FALSE,
- mz_include = mzs_to_include, ppm = 20)
+library(RaMS)
+
+if(!dir.exists("inst/extdata"))dir.create("inst/extdata", recursive = TRUE)
+init_files <- list.files("vignettes/data/", full.names = TRUE)
+out_files <- paste0("inst/extdata/", basename(init_files))
+minifyMSdata(files = init_files, output_files = out_files, warn = FALSE,
+ mz_include = mzs_to_include, ppm = 20)
Now we have four minified mzXML files in our inst/extdata folder.
However, we’d like to be able to demo the mzML functionality as well as
that of mzXMLs, so we can use Proteowizard’s
@@ -544,28 +544,28 @@
Minifying files with RaMS
time, keeping data between 4 and 15 minutes.
Finally, we gzip the files to get them as small as possible, also
using msconvert
.
-system("msconvert inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
-system("msconvert --mzXML inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
-system('msconvert inst/extdata/temp/*.mzML --filter \"scanTime [240,900]\" -o inst/extdata -g')
-system('msconvert inst/extdata/temp/*.mzXML --mzXML --filter \"scanTime [240,900]\" -o inst/extdata -g')
+system("msconvert inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
+system("msconvert --mzXML inst/extdata/*.mzXML -o inst/extdata/temp --noindex")
+system('msconvert inst/extdata/temp/*.mzML --filter \"scanTime [240,900]\" -o inst/extdata -g')
+system('msconvert inst/extdata/temp/*.mzXML --mzXML --filter \"scanTime [240,900]\" -o inst/extdata -g')
And then for the last few steps, we again rename the files (since
msconvert
expands them to their full .raw names) and remove
the ones we don’t need for the demos.
-init_files <- list.files("inst/extdata", full.names = TRUE)
-new_names <- paste0("inst/extdata/", gsub(x=init_files, ".*(Smp_|Extracts_)", ""))
-file.rename(init_files, new_names)
-
-unlink("inst/extdata/temp", recursive = TRUE)
-file.remove(list.files("inst/extdata", pattern = "mzXML$", full.names = TRUE))
-file.remove(paste0("inst/extdata/", c("LB12HL_CD.mzXML.gz", "LB12HL_EF.mzXML.gz")))
+init_files <- list.files("inst/extdata", full.names = TRUE)
+new_names <- paste0("inst/extdata/", gsub(x=init_files, ".*(Smp_|Extracts_)", ""))
+file.rename(init_files, new_names)
+
+unlink("inst/extdata/temp", recursive = TRUE)
+file.remove(list.files("inst/extdata", pattern = "mzXML$", full.names = TRUE))
+file.remove(paste0("inst/extdata/", c("LB12HL_CD.mzXML.gz", "LB12HL_EF.mzXML.gz")))
To check that the new files look ok, we can see if we can read them
with RaMS
and MSnbase
.
-MSnbase::readMSData(list.files("inst/extdata", full.names = TRUE)[1], msLevel. = 1)
-RaMS::grabMSdata(new_names[1])
+MSnbase::readMSData(list.files("inst/extdata", full.names = TRUE)[1], msLevel. = 1)
+RaMS::grabMSdata(new_names[1])
Finally, remember to clean up the original downloads folder
-unlink("vignettes/data", recursive = TRUE)
+unlink("vignettes/data", recursive = TRUE)
-README last built on 2022-11-14
+README last built on 2023-12-13
diff --git a/doc/RaMS-and-friends.R b/doc/RaMS-and-friends.R
index 65c1825..a1d47a7 100644
--- a/doc/RaMS-and-friends.R
+++ b/doc/RaMS-and-friends.R
@@ -1,5 +1,6 @@
## ----setup, include = FALSE---------------------------------------------------
options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
## -----------------------------------------------------------------------------
library(RaMS)
diff --git a/doc/RaMS-and-friends.Rmd b/doc/RaMS-and-friends.Rmd
index c215d02..1ff0206 100644
--- a/doc/RaMS-and-friends.Rmd
+++ b/doc/RaMS-and-friends.Rmd
@@ -9,6 +9,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
```
**Table of contents:**
diff --git a/doc/RaMS-and-friends.html b/doc/RaMS-and-friends.html
index 297677a..beb8057 100644
--- a/doc/RaMS-and-friends.html
+++ b/doc/RaMS-and-friends.html
@@ -360,26 +360,27 @@ Standard export to CSV
exported to CSV files with base R functions. This works best with a few
chromatograms at a time, as the millions of data points found in most MS
files can overwhelm common file readers.
-library(RaMS)
-
-# Locate an MS file
-single_file <- system.file("extdata", "LB12HL_AB.mzML.gz", package = "RaMS")
-
-# Grab the MS data
-msdata <- grabMSdata(single_file, grab_what = "everything")
+library(RaMS)
+
+# Locate an MS file
+single_file <- system.file("extdata", "LB12HL_AB.mzML.gz", package = "RaMS")
+
+# Grab the MS data
+msdata <- grabMSdata(single_file, grab_what = "everything")
##
-## Reading file LB12HL_AB.mzML.gz... 0.23 secs
-## Reading MS1 data...0.14 secs
-## Reading MS2 data...0.02 secs
+## Reading file LB12HL_AB.mzML.gz... 0.15 secs
+## Reading MS1 data...0.11 secs
+## Reading MS2 data...0.01 secs
## Reading BPC...0.08 secs
-## Reading TIC...0.07 secs
-## Reading file metadata...0.14 secs
-## Total time: 0.7 secs
-# Write out MS1 data to .csv file
-write.csv(x = msdata$MS1, file = "MS1_data.csv")
-
-# Clean up afterward
-file.remove("MS1_data.csv")
+## Reading TIC...0.06 secs
+## Reading file metadata...0.1 secs
+## Binding files together into single data.table
+## Total time: 0.51 secs
+# Write out MS1 data to .csv file
+write.csv(x = msdata$MS1, file = "MS1_data.csv")
+
+# Clean up afterward
+file.remove("MS1_data.csv")
## [1] TRUE
@@ -391,25 +392,26 @@
Fancier export to Excel
uses the
openxlsx
package, although there are several alternatives with identical
functionality.
-
library(openxlsx)
-
-# Locate an MS2 file
-MS2_file <- system.file("extdata", "DDApos_2.mzML.gz", package = "RaMS")
-
-# Grab the MS1 and MS2 data
-msdata <- grabMSdata(MS2_file, grab_what=c("MS1", "MS2"))
+
library(openxlsx)
+
+# Locate an MS2 file
+MS2_file <- system.file("extdata", "DDApos_2.mzML.gz", package = "RaMS")
+
+# Grab the MS1 and MS2 data
+msdata <- grabMSdata(MS2_file, grab_what=c("MS1", "MS2"))
##
-## Reading file DDApos_2.mzML.gz... 0.54 secs
-## Reading MS1 data...0.34 secs
-## Reading MS2 data...0.12 secs
-## Total time: 1 secs
-
# Write out MS data to Excel file
-# openxlsx writes each object in a list to a unique sheet
-# Produces one sheet for MS1 and one for MS2
-write.xlsx(msdata, file = "MS2_data.xlsx")
-
-# Clean up afterward
-file.remove("MS2_data.xlsx")
+## Reading file DDApos_2.mzML.gz... 0.41 secs
+## Reading MS1 data...0.55 secs
+## Reading MS2 data...0.11 secs
+## Binding files together into single data.table
+## Total time: 1.07 secs
+
# Write out MS data to Excel file
+# openxlsx writes each object in a list to a unique sheet
+# Produces one sheet for MS1 and one for MS2
+write.xlsx(msdata, file = "MS2_data.xlsx")
+
+# Clean up afterward
+file.remove("MS2_data.xlsx")
## [1] TRUE
@@ -419,12 +421,12 @@
Exporting to SQL database
This vignette will demo the
RSQLite
package’s engine, although several other database engines have similar
functionality.
-
library(DBI)
-# Get data from multiple files to show off
-mzml_files <- system.file(c("extdata/LB12HL_AB.mzML.gz",
- "extdata/LB12HL_CD.mzML.gz"),
- package = "RaMS")
-msdata <- grabMSdata(mzml_files)
+
library(DBI)
+# Get data from multiple files to show off
+mzml_files <- system.file(c("extdata/LB12HL_AB.mzML.gz",
+ "extdata/LB12HL_CD.mzML.gz"),
+ package = "RaMS")
+msdata <- grabMSdata(mzml_files)
##
|
| | 0%
@@ -432,57 +434,58 @@ Exporting to SQL database
|=================================== | 50%
|
|======================================================================| 100%
-## Total time: 1.12 secs
-
# Create the sqlite database and connect to it
-MSdb <- dbConnect(RSQLite::SQLite(), "MSdata.sqlite")
-
-# Export MS1 and MS2 data to sqlite tables
-dbWriteTable(MSdb, "MS1", msdata$MS1)
-dbWriteTable(MSdb, "MS2", msdata$MS2)
-dbListTables(MSdb)
+## Total time: 0.98 secs
+
# Create the sqlite database and connect to it
+MSdb <- dbConnect(RSQLite::SQLite(), "MSdata.sqlite")
+
+# Export MS1 and MS2 data to sqlite tables
+dbWriteTable(MSdb, "MS1", msdata$MS1)
+dbWriteTable(MSdb, "MS2", msdata$MS2)
+dbListTables(MSdb)
## [1] "MS1" "MS2"
-
# Perform a simple query to ensure data was exported correctly
-dbGetQuery(MSdb, 'SELECT * FROM MS1 LIMIT 3')
+
# Perform a simple query to ensure data was exported correctly
+dbGetQuery(MSdb, 'SELECT * FROM MS1 LIMIT 3')
## rt mz int filename
## 1 4.009 139.0503 1800550.12 LB12HL_AB.mzML.gz
## 2 4.009 148.0967 206310.81 LB12HL_AB.mzML.gz
## 3 4.009 136.0618 71907.15 LB12HL_AB.mzML.gz
-
# Perform EIC extraction in SQL rather than in R
-EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
-query_params <- list(lower_bound=118.086, upper_bound=118.087)
-EIC <- dbGetQuery(MSdb, EIC_query, params = query_params)
-
-# Append with additional files
-extra_file <- system.file("extdata", "LB12HL_EF.mzML.gz", package = "RaMS")
-extra_msdata <- grabMSdata(extra_file, grab_what = "everything")
+
# Perform EIC extraction in SQL rather than in R
+EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+query_params <- list(lower_bound=118.086, upper_bound=118.087)
+EIC <- dbGetQuery(MSdb, EIC_query, params = query_params)
+
+# Append with additional files
+extra_file <- system.file("extdata", "LB12HL_EF.mzML.gz", package = "RaMS")
+extra_msdata <- grabMSdata(extra_file, grab_what = "everything")
##
-## Reading file LB12HL_EF.mzML.gz... 0.16 secs
-## Reading MS1 data...0.15 secs
+## Reading file LB12HL_EF.mzML.gz... 0.15 secs
+## Reading MS1 data...0.3 secs
## Reading MS2 data...0.01 secs
-## Reading BPC...0.07 secs
-## Reading TIC...0.07 secs
-## Reading file metadata...0.14 secs
-## Total time: 0.62 secs
-
dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1') # Initial number of rows
+## Reading BPC...0.06 secs
+## Reading TIC...0.06 secs
+## Reading file metadata...0.1 secs
+## Binding files together into single data.table
+## Total time: 0.69 secs
+
dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1') # Initial number of rows
## COUNT(*)
## 1 42313
-
dbAppendTable(MSdb, "MS1", extra_msdata$MS1)
+
dbAppendTable(MSdb, "MS1", extra_msdata$MS1)
## [1] 22124
-
# Confirm three different files exist in DB
-dbGetQuery(MSdb, 'SELECT DISTINCT filename FROM MS1')
+
# Confirm three different files exist in DB
+dbGetQuery(MSdb, 'SELECT DISTINCT filename FROM MS1')
## filename
## 1 LB12HL_AB.mzML.gz
## 2 LB12HL_CD.mzML.gz
## 3 LB12HL_EF.mzML.gz
-
# Confirm new rows have been added
-dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1')
+
# Confirm new rows have been added
+dbGetQuery(MSdb, 'SELECT COUNT(*) FROM MS1')
## COUNT(*)
## 1 64437
-
# Disconnect after export
-dbDisconnect(MSdb)
-
-# Clean up afterward
-unlink("MSdata.sqlite")
+
# Disconnect after export
+dbDisconnect(MSdb)
+
+# Clean up afterward
+unlink("MSdata.sqlite")
Interfacing with Python via reticulate
@@ -492,11 +495,11 @@
Interfacing with Python via reticulate
Python code chunks as shown below.
R code chunk: {r}
-
# Locate a couple MS files
-data_dir <- system.file("extdata", package = "RaMS")
-file_paths <- list.files(data_dir, pattern = "HL.*mzML", full.names = TRUE)
-
-msdata <- grabMSdata(files = file_paths, grab_what = "BPC")$BPC
+
# Locate a couple MS files
+data_dir <- system.file("extdata", package = "RaMS")
+file_paths <- list.files(data_dir, pattern = "HL.*mzML", full.names = TRUE)
+
+msdata <- grabMSdata(files = file_paths, grab_what = "BPC")$BPC
##
|
| | 0%
@@ -506,18 +509,18 @@ R code chunk: {r}
|=============================================== | 67%
|
|======================================================================| 100%
-## Total time: 0.42 secs
+## Total time: 0.34 secs
Python code chunk: {python}
-
# Not run to pass R CMD check on GitHub
-# Make sure python, matplotlib, and seaborn are installed
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-sns.relplot(data=r.msdata, kind="line", x="rt", y="int", hue="filename")
-plt.show()
+
# Not run to pass R CMD check on GitHub
+# Make sure python, matplotlib, and seaborn are installed
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+sns.relplot(data=r.msdata, kind="line", x="rt", y="int", hue="filename")
+plt.show()
diff --git a/doc/speed_size_comparison.R b/doc/speed_size_comparison.R
new file mode 100644
index 0000000..d495564
--- /dev/null
+++ b/doc/speed_size_comparison.R
@@ -0,0 +1,563 @@
+## ----setup, include=FALSE-----------------------------------------------------
+knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
+options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
+
+## -----------------------------------------------------------------------------
+# library(RaMS)
+# library(tidyverse)
+# library(microbenchmark)
+# library(MSnbase)
+# library(Spectra)
+# library(DBI)
+# library(arrow)
+# library(rvest)
+# library(xml2)
+#
+# BiocParallel::register(BiocParallel::SerialParam(stop.on.error = FALSE, progressbar = TRUE))
+
+## ----file download------------------------------------------------------------
+# set.seed(123)
+#
+# n_ms_files <- 10
+# base_url <- "ftp://massive.ucsd.edu/v01/MSV000080030/peak/" %>%
+# paste0("Forensic_study_80_volunteers/Forensic_Hands_mzXML/") %>%
+# paste0("Forensic_Hands_plate_1_mzXML/Sample/")
+# file_list <- base_url %>%
+# read_html %>%
+# xml_text() %>%
+# strsplit("\\\n") %>%
+# pluck(1) %>%
+# str_remove(".*2016 ") %>%
+# str_remove("\\\r")
+# chosen_file_list <- sample(file_list, n_ms_files)
+#
+# dir.create("vignettes/figures/ssc_vignette_renders/Sample/")
+# for(i in chosen_file_list){
+# new_file_path <- paste0("vignettes/figures/ssc_vignette_renders/Sample/", i)
+# download.file(paste0(base_url, i), destfile = new_file_path, mode = "wb")
+# }
+#
+# ms_files <- list.files("vignettes/figures/ssc_vignette_renders/Sample/", full.names = TRUE)
+
+## ----MSnExp-------------------------------------------------------------------
+# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1)
+# plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20)))
+
+## ----onDiskMSnExp-------------------------------------------------------------
+# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1)
+# plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20)))
+
+## ----Spectra------------------------------------------------------------------
+# getIntensities <- function(x, ...) {
+# if (nrow(x)) {
+# cbind(mz = NA_real_, intensity = x[, "intensity"])
+# } else cbind(mz = NA_real_, intensity = NA_real_)
+# }
+#
+# sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>%
+# filterMsLevel(1) %>%
+# filterMzRange(pmppm(432.2810, ppm = 20))
+# sfs_agg <- addProcessing(sfs_filtered, getIntensities)
+# eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE))
+# plot(eic[,"rt"], eic[,"int"], type="l")
+
+## ----RaMS---------------------------------------------------------------------
+# rams_obj <- grabMSdata(ms_files, grab_what="MS1")
+# rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)]
+# plot(rams_chrom_data$rt, rams_chrom_data$int, type="l")
+
+## ----tmzML--------------------------------------------------------------------
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 3, progressbar = TRUE, tasks=length(tmzml_names)))
+# tmzml_obj <- grabMSdata(tmzml_names)
+# tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)]
+# plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l")
+
+## ----arrow--------------------------------------------------------------------
+# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds")
+# arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>%
+# filter(mz%between%pmppm(432.2810, ppm = 20)) %>%
+# dplyr::collect()
+# plot(arrow_data$rt, arrow_data$int, type="l")
+
+## ----sqlite database----------------------------------------------------------
+# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE)
+# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+# query_params <- as.list(pmppm(432.2810, ppm = 20))
+# names(query_params) <- c("lower_bound", "upper_bound")
+# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params)
+# plot(sql_data$rt, sql_data$int, type="l")
+#
+# rs <- dbSendQuery(MSdb, "CREATE INDEX mz ON MS1 (mz)")
+# dbClearResult(rs)
+# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params)
+# sql_data <- sql_data[order(sql_data$filename, sql_data$rt),]
+# plot(sql_data$rt, sql_data$int, type="l")
+# dbDisconnect(MSdb)
+
+## ----time2make----------------------------------------------------------------
+# msnexp_make_fun <- function(){
+# readMSData(ms_files, mode="inMemory", msLevel. = 1)
+# }
+# ondisk_make_fun <- function(){
+# readMSData(ms_files, mode="onDisk", msLevel. = 1)
+# }
+# spectra_make_fun <- function(){
+# Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1)
+# }
+# rams_make_fun <- function(){
+# grabMSdata(ms_files, grab_what="MS1")
+# }
+# tmzml_make_fun <- function(){
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/",
+# gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# mapply(tmzmlMaker, ms_files, tmzml_names)
+# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE)
+# }
+# arrow_make_fun <- function(){
+# msdata <- grabMSdata(ms_files, grab_what="MS1")
+# write_dataset(msdata$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds")
+# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE)
+# }
+# sql_make_fun <- function(){
+# msdata <- grabMSdata(ms_files, grab_what="MS1")
+# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# dbWriteTable(MSdb, "MS1", msdata$MS1, overwrite=TRUE)
+# dbDisconnect(MSdb)
+# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# }
+# sqlidx_make_fun <- function(){
+# msdata <- grabMSdata(ms_files, grab_what="MS1")
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# dbWriteTable(MSdb_idx, "MS1", msdata$MS1, overwrite=TRUE)
+# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)")
+# dbClearResult(rs)
+# dbDisconnect(MSdb_idx)
+# unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# }
+#
+# make_timings <- microbenchmark(
+# msnexp_make_fun(), ondisk_make_fun(), spectra_make_fun(), rams_make_fun(),
+# tmzml_make_fun(), arrow_make_fun(), sql_make_fun(), sqlidx_make_fun(),
+# times = 10)
+# saveRDS(make_timings, "vignettes/figures/ssc_vignette_renders/make_timings.rds")
+
+## ----plot time2make-----------------------------------------------------------
+# make_timings <- readRDS("vignettes/figures/ssc_vignette_renders/make_timings.rds")
+# make_timings %>%
+# as.data.frame() %>%
+# arrange(expr) %>%
+# mutate(expr=str_remove(expr, "_make_fun\\(\\)")) %>%
+# mutate(rep_type=case_when(
+# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session",
+# TRUE~"Single-time only"
+# )) %>%
+# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams",
+# "tmzml", "arrow", "sql", "sqlidx"),
+# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS",
+# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>%
+# ggplot() +
+# geom_boxplot(aes(x=expr, y=time/1e9)) +
+# geom_hline(yintercept = 0) +
+# facet_wrap(~rep_type, nrow = 1, scales="free_x") +
+# labs(y="Seconds", x=NULL) +
+# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
+# ggsave("vignettes/figures/ssc_vignette_renders/make_time_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144)
+
+## ----parallel_proc------------------------------------------------------------
+# unpar_rams <- function(){
+# print("Unpar RaMS")
+# lapply(ms_files, grabMSdata)
+# }
+# unpar_tmzml <- function(){
+# print("Unpar tmzML")
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/",
+# gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# pbapply::pbmapply(tmzmlMaker, ms_files, tmzml_names)
+# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE)
+# }
+#
+# library(BiocParallel)
+# par_param <- SnowParam(workers = 5, progressbar = TRUE)
+# par_rams <- function(){
+# print("Par RaMS")
+# bplapply(ms_files, grabMSdata, BPPARAM = par_param)
+# }
+# par_tmzml <- function(){
+# print("Par tmzML")
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/",
+# gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = par_param)
+# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE)
+# }
+#
+# par_timings <- microbenchmark(par_rams(), unpar_rams(), par_tmzml(), unpar_tmzml(), times = 5)
+# saveRDS(par_timings, "vignettes/figures/ssc_vignette_renders/par_timings.rds")
+
+## ----parallel proc plot-------------------------------------------------------
+# par_timings <- readRDS("vignettes/figures/ssc_vignette_renders/par_timings.rds")
+# par_timings %>%
+# as.data.frame() %>%
+# separate(expr, into = c("sub_type", "par_type"), sep = "_") %>%
+# mutate(par_type=str_remove(par_type, "\\(\\)")) %>%
+# mutate(sub_type=factor(sub_type, levels=c("unpar", "par"),
+# labels=c("Sequential", "Parallel"))) %>%
+# mutate(par_type=factor(par_type, levels=c("rams", "tmzml"),
+# labels=c("RaMS", "tmzMLs"))) %>%
+# ggplot() +
+# geom_boxplot(aes(x=par_type, y=time/1e9)) +
+# geom_hline(yintercept = 0) +
+# facet_wrap(~sub_type, nrow = 1) +
+# labs(y="Seconds", x=NULL) +
+# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
+# ggsave("vignettes/figures/ssc_vignette_renders/par_time_gp.png", width = 6.5, height = 3, units = "in", device = "png", dpi = 144)
+
+## ----time2query---------------------------------------------------------------
+# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1)
+# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1)
+# spectra_obj <- Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1)
+# rams_obj <- grabMSdata(ms_files, grab_what="MS1")
+#
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/",
+# gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# mapply(tmzmlMaker, ms_files, tmzml_names)
+#
+# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds")
+#
+# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE)
+# dbDisconnect(MSdb)
+#
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# dbWriteTable(MSdb_idx, "MS1", rams_obj$MS1, overwrite=TRUE)
+# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)")
+# dbClearResult(rs)
+# dbDisconnect(MSdb_idx)
+#
+# msnexp_query_fun <- function(){
+# plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20)))
+# }
+# ondisk_query_fun <- function(){
+# plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20)))
+# }
+# spectra_query_fun <- function(){
+# sfs_filtered <- spectra_obj %>% filterMzRange(pmppm(432.2810, ppm = 20))
+# getIntensities <- function(x, ...) {
+# if (nrow(x)) {
+# cbind(mz = NA_real_, intensity = x[, "intensity"])
+# } else cbind(mz = NA_real_, intensity = NA_real_)
+# }
+# sfs_agg <- addProcessing(sfs_filtered, getIntensities)
+# eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE))
+# plot(eic[,"rt"], eic[,"int"], type="l")
+# }
+# rams_query_fun <- function(){
+# rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)]
+# plot(rams_chrom_data$rt, rams_chrom_data$int, type="l")
+# }
+# tmzml_query_fun <- function(){
+# tmzml_names <- list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE)
+# tmzml_obj <- grabMSdata(tmzml_names)
+# tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)]
+# plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l")
+# }
+# arrow_query_fun <- function(){
+# arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>%
+# filter(mz%between%pmppm(432.2810, ppm = 20)) %>%
+# dplyr::collect()
+# plot(arrow_data$rt, arrow_data$int, type="l")
+# }
+# sql_query_fun <- function(){
+# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+# query_params <- as.list(pmppm(432.2810, ppm = 20))
+# names(query_params) <- c("lower_bound", "upper_bound")
+# sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params)
+# plot(sql_data$rt, sql_data$int, type="l")
+# }
+# sqlidx_query_fun <- function(){
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+# query_params <- as.list(pmppm(432.2810, ppm = 20))
+# names(query_params) <- c("lower_bound", "upper_bound")
+# sql_data <- dbGetQuery(MSdb_idx, EIC_query, params = query_params)
+# sql_data <- sql_data[order(sql_data$filename, sql_data$rt),]
+# plot(sql_data$rt, sql_data$int, type="l")
+# }
+#
+# query_timings <- microbenchmark(
+# msnexp_query_fun(), ondisk_query_fun(), spectra_query_fun(), rams_query_fun(),
+# tmzml_query_fun(), arrow_query_fun(), sql_query_fun(), sqlidx_query_fun(),
+# times = 10
+# )
+# query_timings
+# saveRDS(query_timings, "vignettes/figures/ssc_vignette_renders/query_timings.rds")
+
+## ----time2query plot----------------------------------------------------------
+# query_timings <- readRDS("vignettes/figures/ssc_vignette_renders/query_timings.rds")
+# query_timings %>%
+# as.data.frame() %>%
+# arrange(expr) %>%
+# mutate(expr=str_remove(expr, "_query_fun\\(\\)")) %>%
+# mutate(rep_type=case_when(
+# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session",
+# TRUE~"Single-time only"
+# )) %>%
+# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams",
+# "tmzml", "arrow", "sql", "sqlidx"),
+# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS",
+# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>%
+# ggplot() +
+# geom_boxplot(aes(x=expr, y=time/1e9)) +
+# scale_y_log10() +
+# labs(y="Seconds", x=NULL) +
+# theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
+# ggsave("vignettes/figures/ssc_vignette_renders/query_time_gp.png", width = 6.5, height = 5, units = "in", device = "png", dpi = 144)
+
+## ----double-check sql_idx vs RaMS---------------------------------------------
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+# query_params <- as.list(pmppm(432.2810, ppm = 20))
+# names(query_params) <- c("lower_bound", "upper_bound")
+# dbGetQuery(MSdb_idx, EIC_query, params = query_params) %>% qplotMS1data()
+#
+# rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] %>% qplotMS1data()
+
+## ----make query strings-------------------------------------------------------
+# rams_obj <- grabMSdata(ms_files, grab_what="MS1")
+# grouped_ms1 <- rams_obj$MS1 %>%
+# arrange(desc(int)) %>%
+# mutate(mz_group=mz_group(mz, ppm = 10, max_groups = 10, min_group_size=20)) %>%
+# drop_na()
+# # grouped_ms1 %>%
+# # qplotMS1data(facet_col="mz_group", facet_args = list(ncol=2))
+# mzs_to_grab <- grouped_ms1 %>%
+# group_by(mz_group) %>%
+# summarise(mean_mz=mean(mz), sd_mz=sd(mz), mean_rt=mean(rt)) %>%
+# pull(mean_mz)
+#
+# rams_arrow_call <- lapply(mzs_to_grab, function(mz_i){
+# mzrange <- pmppm(mz_i, 10)
+# call("between", as.name("mz"), mzrange[[1]], mzrange[[2]])
+# }) %>% paste(collapse = "|")
+#
+# sql_comb_call <- sapply(mzs_to_grab, function(mz_i){
+# paste("mz BETWEEN", pmppm(mz_i, 10)[1], "AND", pmppm(mz_i, 10)[2])
+# }) %>% paste(collapse = " OR ") %>% paste("SELECT * FROM MS1 WHERE", .)
+#
+# print(rams_arrow_call)
+# print(sql_comb_call)
+
+## ----multichrom query timing--------------------------------------------------
+# rams_uni_fun <- function(){
+# print("RaMS unified")
+# rams_obj$MS1[eval(parse(text=rams_arrow_call))]
+# }
+# rams_loop_fun <- function(){
+# print("RaMS loop")
+# lapply(mzs_to_grab, function(mz_i){
+# rams_obj$MS1[mz%between%pmppm(mz_i, 10)]
+# }) %>% bind_rows() %>% distinct()
+# }
+#
+# arrow_ds <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds")
+# arrow_uni_fun <- function(){
+# print("Arrow unified")
+# arrow_ds %>%
+# filter(eval(parse(text = rams_arrow_call))) %>%
+# collect()
+# }
+# arrow_loop_fun <- function(){
+# print("Arrow loop")
+# lapply(mzs_to_grab, function(mz_i){
+# arrow_ds %>%
+# filter(mz%between%pmppm(mz_i, 10)) %>%
+# collect()
+# }) %>% bind_rows() %>% distinct()
+# }
+#
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# sql_uni_fun <- function(){
+# print("SQL unified")
+# dbGetQuery(MSdb_idx, sql_comb_call)
+# }
+# sql_query_base <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound'
+# sql_loop_fun <- function(){
+# print("SQL loop")
+# lapply(mzs_to_grab, function(mz_i){
+# query_params <- as.list(pmppm(mz_i, ppm = 20))
+# names(query_params) <- c("lower_bound", "upper_bound")
+# sql_data <- dbGetQuery(MSdb_idx, sql_query_base, params = query_params)
+# }) %>% bind_rows() %>% distinct()
+# }
+#
+# multichrom_timings <- microbenchmark(
+# rams_uni_fun(), rams_loop_fun(), arrow_uni_fun(), arrow_loop_fun(),
+# sql_uni_fun(), sql_loop_fun(), times = 10
+# )
+# saveRDS(multichrom_timings, "vignettes/figures/ssc_vignette_renders/multichrom_timings.rds")
+#
+# multichrom_timings <- readRDS("vignettes/figures/ssc_vignette_renders/multichrom_timings.rds")
+# multichrom_timings %>%
+# as.data.frame() %>%
+# arrange(expr) %>%
+# mutate(expr=str_remove(expr, "_fun\\(\\)")) %>%
+# separate(expr, into = c("expr", "query_type"), sep = "_") %>%
+# mutate(expr=factor(expr, levels=c("rams", "arrow", "sql"),
+# labels=c("RaMS", "Arrow", "SQL"))) %>%
+# mutate(query_type=factor(query_type, levels=c("uni", "loop"),
+# labels=c("Unified query", "Query loop"))) %>%
+# ggplot() +
+# geom_boxplot(aes(x=query_type, y=time/1e9), lwd=1) +
+# facet_wrap(~expr, nrow=1) +
+# scale_y_log10() +
+# labs(y="Seconds", x=NULL, color=NULL) +
+# theme_bw()
+# ggsave("vignettes/figures/ssc_vignette_renders/multichrom_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144)
+
+## ----sizing info--------------------------------------------------------------
+# size_list <- list()
+#
+# size_list$mzXML <- sum(file.size(ms_files))
+#
+# msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1)
+# size_list$msnexp_obj <- pryr::object_size(msnexp_obj)
+# rm(msnexp_obj)
+#
+# ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1)
+# size_list$ondisk_obj <- pryr::object_size(ondisk_obj)
+# rm(ondisk_obj)
+#
+# sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>%
+# filterMsLevel(1)
+# size_list$spectra <- pryr::object_size(sfs_filtered)
+# rm(sfs_filtered)
+#
+# rams_obj <- grabMSdata(ms_files, grab_what="MS1")
+# size_list$rams <- pryr::object_size(rams_obj)
+#
+# tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files)))
+# dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs")
+# bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 5, progressbar = TRUE, tasks=length(tmzml_names)))
+# size_list$tmzml <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE)))
+# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE)
+#
+# write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds")
+# size_list$arrow <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/pqds", full.names = TRUE)))
+# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE)
+#
+# MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE)
+# dbDisconnect(MSdb)
+# size_list$MSdb <- file.size("vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+#
+#
+# MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)")
+# dbClearResult(rs)
+# dbDisconnect(MSdb_idx)
+# size_list$MSdb_idx <- file.size("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+#
+# saveRDS(size_list, "vignettes/figures/ssc_vignette_renders/size_list.rds")
+
+## ----plot size info-----------------------------------------------------------
+# size_list <- readRDS("vignettes/figures/ssc_vignette_renders/size_list.rds")
+# size_list %>%
+# within(rm(mzXML)) %>%
+# sapply(as.numeric) %>%
+# data.frame(bytes=.) %>%
+# rownames_to_column("expr") %>%
+# mutate(expr=str_remove(expr, "_obj")) %>%
+# mutate(expr=str_replace(expr, "MSdb_?", "sql")) %>%
+# mutate(mem_type=case_when(
+# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory",
+# TRUE~"Disk"
+# )) %>%
+# mutate(mem_type=factor(mem_type, levels=c("Memory", "Disk"))) %>%
+# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams",
+# "tmzml", "arrow", "sql", "sqlidx"),
+# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS",
+# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>%
+# ggplot() +
+# geom_hline(yintercept = size_list$mzXML/(1024^3)) +
+# geom_point(aes(x=expr, y=bytes/(1024^3))) +
+# scale_y_log10(breaks=c(0.001, 0.01, 0.1, 1, 10), labels=c("1MB", "10MB", "100MB", "1GB", "10GB"),
+# limits=c(0.001, 10)) +
+# facet_wrap(~mem_type, scales = "free_x") +
+# labs(x=NULL, y=NULL)
+# ggsave("vignettes/figures/ssc_vignette_renders/size_cons.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144)
+
+## ----summary plot-------------------------------------------------------------
+# make_timings <- readRDS("vignettes/figures/ssc_vignette_renders/make_timings.rds")
+# query_timings <- readRDS("vignettes/figures/ssc_vignette_renders/query_timings.rds")
+# size_df <- readRDS("vignettes/figures/ssc_vignette_renders/size_list.rds") %>%
+# sapply(as.numeric) %>%
+# data.frame(size=.) %>%
+# mutate(size=size/1024^3) %>%
+# rownames_to_column("expr") %>%
+# mutate(expr=str_remove(expr, "_obj")) %>%
+# mutate(expr=str_replace(expr, "MSdb_?", "sql"))
+#
+# bind_rows(make_timings, query_timings) %>%
+# as.data.frame() %>%
+# group_by(expr) %>%
+# mutate(time=time/1e9) %>%
+# summarise(med_time=median(time), IQR_time=IQR(time)) %>%
+# mutate(expr=str_remove(expr, "_fun\\(\\)")) %>%
+# separate(expr, into = c("expr", "time_type")) %>%
+# left_join(size_df) %>%
+# mutate(rep_type=case_when(
+# expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory",
+# TRUE~"Disk"
+# )) %>%
+# mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams",
+# "tmzml", "arrow", "sql", "sqlidx"),
+# labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS",
+# "tmzMLs", "Arrow", "SQL", "SQL (indexed)"))) %>%
+# pivot_wider(names_from = time_type, values_from=c("med_time", "IQR_time")) %>%
+# ggplot() +
+# geom_vline(xintercept = 0) +
+# geom_linerange(aes(x=med_time_make, ymin=med_time_query-IQR_time_query*2,
+# ymax=med_time_query+IQR_time_query*2, color=expr)) +
+# geom_linerange(aes(y=med_time_query, xmin=med_time_make-IQR_time_make*2,
+# xmax=med_time_make+IQR_time_make*2, color=expr)) +
+# geom_point(aes(x=med_time_make, y=med_time_query, color=expr,
+# size=size, shape=rep_type)) +
+# scale_shape_manual(values=c(16, 15)) +
+# scale_y_log10() +
+# coord_flip() +
+# guides(color = guide_legend(order = 1), shape = guide_legend(order = 2),
+# size=guide_legend(order=3)) +
+# labs(x="Time to transform (s)", y="Time to query (s)", color=NULL, size="Size (GB)",
+# shape="Storage") +
+# theme_bw()
+# ggsave("vignettes/figures/ssc_vignette_renders/sum_plot.png", width = 6.5, height = 5, units = "in", device = "png", dpi = 144)
+#
+#
+# # bind_rows(make_timings, query_timings) %>%
+# # as.data.frame() %>%
+# # group_by(expr) %>%
+# # mutate(time=time/1e9) %>%
+# # summarise(med_time=median(time)) %>%
+# # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>%
+# # separate(expr, into = c("expr", "time_type")) %>%
+# # pivot_wider(names_from = time_type, values_from=med_time) %>%
+# # left_join(size_df) %>%
+# # plotly::plot_ly(x=~make, y=~query, z=~size, hovertext=~expr,
+# # type="scatter3d", mode="markers")
+
+## ----cleanup------------------------------------------------------------------
+# unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE)
+# unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE)
+# unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite")
+# unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite")
+# unlink("vignettes/figures/ssc_vignette_renders/Sample/", recursive = TRUE)
+
diff --git a/doc/speed_size_comparison.Rmd b/doc/speed_size_comparison.Rmd
index 7318949..8c52b83 100644
--- a/doc/speed_size_comparison.Rmd
+++ b/doc/speed_size_comparison.Rmd
@@ -14,6 +14,7 @@ vignette: >
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
```
```{r}
diff --git a/doc/speed_size_comparison.html b/doc/speed_size_comparison.html
index a393201..0d7ffd6 100644
--- a/doc/speed_size_comparison.html
+++ b/doc/speed_size_comparison.html
@@ -12,7 +12,7 @@
-
+
Benchmarking - RaMS, MSnBase, Spectra, Arrow, and SQL
@@ -351,7 +351,7 @@
Benchmarking - RaMS, MSnBase, Spectra,
Arrow, and SQL
William Kumler
-2023-11-29
+2023-12-13
diff --git a/vignettes/Intro-to-RaMS.Rmd b/vignettes/Intro-to-RaMS.Rmd
index 7d411eb..43afd02 100644
--- a/vignettes/Intro-to-RaMS.Rmd
+++ b/vignettes/Intro-to-RaMS.Rmd
@@ -12,6 +12,7 @@ vignette: >
```{r, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/vignettes/Intro-to-tmzML.Rmd b/vignettes/Intro-to-tmzML.Rmd
index c7d2ae9..e79b9f8 100644
--- a/vignettes/Intro-to-tmzML.Rmd
+++ b/vignettes/Intro-to-tmzML.Rmd
@@ -12,6 +12,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/vignettes/Minifying-files-with-RaMS.Rmd b/vignettes/Minifying-files-with-RaMS.Rmd
index 6ddbe20..f2e9ea4 100644
--- a/vignettes/Minifying-files-with-RaMS.Rmd
+++ b/vignettes/Minifying-files-with-RaMS.Rmd
@@ -12,6 +12,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
options(tidyverse.quiet = TRUE)
+data.table::setDTthreads(2)
knitr::opts_chunk$set(
collapse = TRUE,
diff --git a/vignettes/RaMS-and-friends.Rmd b/vignettes/RaMS-and-friends.Rmd
index c215d02..1ff0206 100644
--- a/vignettes/RaMS-and-friends.Rmd
+++ b/vignettes/RaMS-and-friends.Rmd
@@ -9,6 +9,7 @@ vignette: >
```{r setup, include = FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
```
**Table of contents:**
diff --git a/vignettes/speed_size_comparison.Rmd b/vignettes/speed_size_comparison.Rmd
index 7318949..8c52b83 100644
--- a/vignettes/speed_size_comparison.Rmd
+++ b/vignettes/speed_size_comparison.Rmd
@@ -14,6 +14,7 @@ vignette: >
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
options(rmarkdown.html_vignette.check_title = FALSE)
+data.table::setDTthreads(2)
```
```{r}