revisions

Becksteinlab · Oct 3, 2024 · 87c46bd · 87c46bd
1 parent d83a5df
commit 87c46bd
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 33 deletions.
diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib
@@ -109,6 +109,24 @@ @article {MDverse:2024
   publisher = {eLife Sciences Publications, Ltd},
 }
 
+@article{MLMDMethods:2023,
+author = {Jackson, Nicholas E. and Savoie, Brett M. and Statt, Antonia and Webb, Michael A.},
+title = {Introduction to Machine Learning for Molecular Simulation},
+journal = {Journal of Chemical Theory and Computation},
+volume = {19},
+number = {14},
+pages = {4335-4337},
+year = {2023},
+doi = {10.1021/acs.jctc.3c00735},
+    note ={PMID: 37489106},
+URL = { 
+        https://doi.org/10.1021/acs.jctc.3c00735
+},
+eprint = { 
+        https://doi.org/10.1021/acs.jctc.3c00735
+}
+}
+
 @Article{NumPy:2020,
     title         = {Array programming with {NumPy}},
     author        = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J.
@@ -166,6 +184,28 @@ @inproceedings{ParallelAnalysis:2010
   series = {FAST'10}
 }
 
+@article{SplitApplyCombine:2011,
+ title={The Split-Apply-Combine Strategy for Data Analysis},
+ volume={40},
+ url={https://www.jstatsoft.org/index.php/jss/article/view/v040i01},
+ doi={10.18637/jss.v040.i01},
+ abstract={Many data analysis problems involve the application of a split-apply-combine strategy, where you break up a big problem into manageable pieces, operate on each piece independently and then put all the pieces back together. This insight gives rise to a new R package that allows you to smoothly apply this strategy, without having to worry about the type of structure in which your data is stored. The paper includes two case studies showing how these insights make it easier to work with batting records for veteran baseball players and a large 3d array of spatio-temporal ozone measurements.},
+ number={1},
+ journal={Journal of Statistical Software},
+ author={Wickham, Hadley},
+ year={2011},
+ pages={1–29}
+}
+
+@article{YiiP:2019,
+author = "Shujie Fan and Oliver Beckstein",
+title = "{Molecular Dynamics trajectories of membrane protein YiiP}",
+year = "2019",
+month = "5",
+url = "https://figshare.com/articles/dataset/Molecular_Dynamics_trajectories_of_membrane_protein_YiiP/8202149",
+doi = "10.6084/m9.figshare.8202149.v1"
+}
+
 @misc{Zarr:2024,
   doi = {10.5281/ZENODO.3773449},
   url = {https://zenodo.org/doi/10.5281/zenodo.3773449},
@@ -176,3 +216,17 @@ @misc{Zarr:2024
   copyright = {Creative Commons Attribution 4.0 International}
 }
 
+@misc{Zstandard:2021,
+    series =    {Request for Comments},
+    number =    8878,
+    howpublished =  {RFC 8878},
+    publisher = {RFC Editor},
+    doi =       {10.17487/RFC8878},
+    url =       {https://www.rfc-editor.org/info/rfc8878},
+    author =    {Yann Collet and Murray Kucherawy},
+    title =     {{Zstandard Compression and the 'application/zstd' Media Type}},
+    pagetotal = 45,
+    year =      2021,
+    month =     feb,
+    abstract =  {Zstandard, or "zstd" (pronounced "zee standard"), is a lossless data compression mechanism. This document describes the mechanism and registers a media type, content encoding, and a structured syntax suffix to be used when transporting zstd-compressed content via MIME. Despite use of the word "standard" as part of Zstandard, readers are advised that this document is not an Internet Standards Track specification; it is being published for informational purposes only. This document replaces and obsoletes RFC 8478.},
+}
diff --git a/joss_paper/paper.md b/joss_paper/paper.md
@@ -9,24 +9,32 @@ tags:
 authors:
   - name: Lawson Woods
     orcid: 0009-0003-0713-4167
-    affiliation: 1 
+    affiliation: [3, 4]
   - name: Hugo Macdermott-Opeskin
     orcid: 0000-0002-7393-7457
-    affiliation: 1
   - name: Edis Jakupovic 
-    affiliation: 1
+    affiliation: [1, 2]
   - name: Yuxuan Zhuang
     orcid: 0000-0003-4390-8556
-    affiliations: 1
+    affiliation: [5, 6]
   - name: Richard J Gowers
     orcid: 0000-0002-3241-1846
-    affiliations: 1
   - name: Oliver Beckstein
     orcid: 0000-0003-1340-0831
-    affiliation: 1
+    affiliation: [1, 2]
 affiliations:
- - name: Placeholder
+ - name: Department of Physics, Arizona State University, Tempe, Arizona, United States of America
    index: 1
+ - name: Center for Biological Physics, Arizona State University, Tempe, Arizona, United States of America
+   index: 2
+ - name: School of Computing and Augmented Intelligence, Arizona State University, Tempe, Arizona, United States of America
+   index: 3
+ - name: School of Molecular Sciences, Arizona State University, Tempe, Arizona, United States of America
+   index: 4
+ - name: Department of Computer Science, Stanford University, Stanford, CA 94305, United States of America
+   index: 5
+ - name: Departments of Molecular and Cellular Physiology and Structural Biology, Stanford University School of Medicine, Stanford, CA 94305, United States of America
+   index: 6
 date: 22 September 2024
 bibliography: paper.bib
 ---
@@ -56,7 +64,7 @@ new constraints on research in this field.
 Other groups in the field recognize this same need for adherence to 
 FAIR principles [@FAIR:2019] including the MDDB (Molecular Dynamics Data Bank), an EU-scale 
 repository for biosimulation data [@MDDB:2024] and MDverse, a prototype search engine 
-for publicly-available Gromacs simulation data [@MDverse:2024].
+for publicly available GROMACS simulation data [@MDverse:2024].
 While these efforts currently offer prototype solutions for indexing and 
 searching MD trajectory data, the problem of efficiently distributing the data remains. 
 
@@ -67,54 +75,64 @@ so a solution which could prevent this
 duplication of storage and unnecessary download step would provide greater utility 
 for the computational molecular sciences ecosystem.
 
-Enter `Zarrtraj`, the first fully-functioning tool to our knowledge that allows 
+Enter *Zarrtraj*, the first fully functioning tool, to our knowledge, that allows 
 streaming trajectories into analysis software using an established trajectory format.
-`Zarrtraj` is implemented as an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] that
-enables streaming MD trajectories in the popular `HDF5`-based H5MD format [@H5MD:2014]
+*Zarrtraj* is implemented as an MDAnalysis [@MDAnalysis:2016] MDAKit [@MDAKits:2023] that
+enables streaming MD trajectories in the popular HDF5-based H5MD format [@H5MD:2014]
 from AWS S3, Google Cloud Buckets, and Azure Blob Storage & Data Lakes without ever downloading them.
-This is possible thanks to the `Zarr` [@Zarr:2024] package which allows 
+This is possible thanks to the *Zarr* [@Zarr:2024] package which allows 
 streaming array-like data from a variety of storage media and [Kerchunk](https://github.com/fsspec/kerchunk), 
-which extends the capability of `Zarr` by allowing it to read `HDF5` files.
-Because it implements the standard `MDAnalysis` trajectory reader API,
-`Zarrtraj` can leverage `Zarr`'s ability to read a file in parallel to perform analysis 
-algorithms in parallel using the "split-apply-combine" paradigm. In addition to the `H5MD` format, 
-`Zarrtraj` can stream and write trajectories in the experimental `ZarrMD` 
-format, which ports the `H5MD` layout to the `Zarr` filetype.
-
-One imported, `Zarrtraj` allows passing trajectory URLs just like ordinary files:
+which extends the capability of *Zarr* by allowing it to read HDF5 files.
+Because it implements the standard MDAnalysis trajectory reader API,
+*Zarrtraj* can leverage *Zarr*'s ability to read a slice of a file and even 
+to read a file in parallel, making it compatible with
+analysis algorithms that use the "split-apply-combine" parallelization strategy [@SplitApplyCombine:2011].
+In addition to the H5MD format, 
+*Zarrtraj* can stream and write trajectories in the experimental ZarrMD
+format, which ports the H5MD layout to the *Zarr* filetype.
+
+Once imported, *Zarrtraj* allows passing trajectory URLs just like ordinary files:
 ```python
 import zarrtraj
 import MDAnalysis as mda
 
-u = mda.Universe("sample_topology.top", "s3://sample-bucket-name/trajectory.h5md")
+u = mda.Universe("topology.pdb", "s3://sample-bucket-name/trajectory.h5md")
 ```
-Initial benchmarks show that `Zarrtraj` can iterate
+Initial benchmarks show that *Zarrtraj* can iterate serially
 through an AWS S3 cloud trajectory (load into memory one frame at a time)
 at roughly 1/2 to 1/3 the speed it can iterate through the same trajectory from disk and roughly 
 1/5 to 1/10 the speed it can iterate through the same trajectory on disk in XTC format (\autoref{fig:benchmark}).
 However, it should be noted that this speed is influenced by network latency and that
-writing parallelized algorithms can offset this loss of speed.
+writing parallelized algorithms can offset this loss of speed as in \autoref{fig:RMSD}. 
+
+![Benchmarks performed on a machine with 2 Intel Xeon 2.00GHz CPUs, 32GB of RAM, and an SSD configured with RAID 0. The trajectory used for benchmarking was the YiiP trajectory from MDAnalysisData [@YiiP:2019], a 9000-frame (90ns), 111,815-particle simulation of a membrane-protein system. The original 3.47GB XTC trajectory was converted into an uncompressed 11.3GB H5MD trajectory and an uncompressed 11.3GB ZarrMD trajectory using the MDAnalysis `H5MDWriter` and *Zarrtraj* `ZarrMD` writers, respectively. \label{fig:benchmark}](benchmark.png)
 
-![Benchmarks performed on a machine with 2 Intel Xeon 2.00GHz CPUs, 32GB of RAM, and an SSD configured with RAID 0.\label{fig:benchmark}](benchmark.png)
+![RMSD benchmarks performed on the same machine as \autoref{fig:benchmark}. The YiiP trajectory was aligned to the first frame as reference using `MDAnalysis.analysis.align.AlignTraj` and converted to compressed, quantized H5MD (7.8GB) and ZarrMD (4.9GB) trajectories. The RMSD was computed using the development branch of MDAnalysis (2.8.0dev) with the "serial" and "dask" backends. See [this notebook]() for the full benchmark code. \label{fig:RMSD}](rmsd.png)
 
-With `Zarrtraj`, we envision research groups making their data publicly available 
+*Zarrtraj* is capable of making use of *Zarr*'s powerful compression and quantization when writing ZarrMD trajectories. 
+The MDAnalysisData YiiP trajectory in ZarrMD format is reduced from 11.3GB uncompressed 
+to just 4.9GB after compression with the Zstandard algorithm [@Zstandard:2021] 
+and quantization to 3 digits of precision. See [performance considerations](https://zarrtraj.readthedocs.io/en/latest/performance_considerations.html)
+for more.
+
+This work builds on the existing MDAnalysis `H5MDReader`
+[@H5MDReader:2021], and similarly uses *NumPy* [@NumPy:2020] as a common interface between MDAnalysis
+and the file storage medium. *Zarrtraj* was inspired and made possible by similar efforts in the 
+geosciences community to align data practices with FAIR principles [@PANGEO:2022].
+
+With *Zarrtraj*, we envision research groups making their data publicly available 
 via a cloud URL so that anyone can reuse their trajectories and reproduce their results.
 Large databases, like MDDB and MDverse, can expose a URL associated with each 
 trajectory in their databases so that users can make a query and immediately use the resulting
 trajectories to run an analysis on the hits that match their search. Groups seeking to 
-collect a large volume of trajectory data to train machine learning models can make use
+collect a large volume of trajectory data to train machine learning models [@MLMDMethods:2023] can make use
 of our tool to efficiently and inexpensively obtain the data they need from these published 
 URLs.
 
-This work builds on the existing `MDAnalysis` `H5MDReader`
-[@H5MDReader:2021], and similarly uses `NumPy` [@NumPy:2020] as a common interface in-between `MDAnalysis`
-and the file storage medium. `Zarrtraj` was inspired and made possible by similar efforts in the 
-geosciences community to align data practices with FAIR principles [@PANGEO:2022].
-
 
 # Acknowledgements
-Thank you to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. 
-Thank you to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase 
+We thank Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. 
+We also thank Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase 
 necessary for this project. LW was a participant in the Google Summer of Code 2024 program.
 
 # References