JOSS Paper #66

Merged
8 commits merged on Oct 3, 2024
Changes shown from 7 commits
Binary file added joss_paper/benchmark.png
78 changes: 78 additions & 0 deletions joss_paper/figure_1.ipynb

Large diffs are not rendered by default.

178 changes: 178 additions & 0 deletions joss_paper/paper.bib
@@ -0,0 +1,178 @@
@article{FAIR:2019,
title = {Make scientific data FAIR},
volume = {570},
ISSN = {1476-4687},
url = {http://dx.doi.org/10.1038/d41586-019-01720-7},
DOI = {10.1038/d41586-019-01720-7},
number = {7759},
journal = {Nature},
publisher = {Springer Science and Business Media LLC},
author = {Stall, Shelley and Yarmey, Lynn and Cutcher-Gershenfeld, Joel and Hanson, Brooks and Lehnert, Kerstin and Nosek, Brian and Parsons, Mark and Robinson, Erin and Wyborn, Lesley},
year = {2019},
month = jun,
pages = {27--29}
}

@misc{FoldingAtHome:2020,
title = {Foldingathome COVID-19 Datasets},
url = {https://registry.opendata.aws/foldingathome-covid19},
note = {Accessed: September 25, 2024}
}

@article{H5MD:2014,
title = {H5MD: A structured, efficient, and portable file format for molecular data},
journal = {Computer Physics Communications},
volume = {185},
number = {6},
pages = {1546--1553},
year = {2014},
issn = {0010-4655},
doi = {10.1016/j.cpc.2014.01.018},
url = {https://www.sciencedirect.com/science/article/pii/S0010465514000447},
author = {Pierre {de Buyl} and Peter H. Colberg and Felix Höfling},
keywords = {Molecular simulation, HDF5},
abstract = {We propose a new file format named “H5MD” for storing molecular simulation data, such as trajectories of particle positions and velocities, along with thermodynamic observables that are monitored during the course of the simulation. H5MD files are HDF5 (Hierarchical Data Format) files with a specific hierarchy and naming scheme. Thus, H5MD inherits many benefits of HDF5, e.g., structured layout of multi-dimensional datasets, data compression, fast and parallel I/O, and portability across many programming languages and hardware platforms. H5MD files are self-contained, and foster the reproducibility of scientific data and the interchange of data between researchers using different simulation programs and analysis software. In addition, the H5MD specification can serve for other kinds of data (e.g. experimental data) and is extensible to supplemental data, or may be part of an enclosing file structure.}
}

@inproceedings{H5MDReader:2021,
address = {Austin, TX},
title = {{MPI}-parallel {Molecular} {Dynamics} {Trajectory} {Analysis} with the {H5MD} {Format} in the {MDAnalysis} {Python} {Package}},
url = {https://conference.scipy.org/proceedings/scipy2021/edis_jakupovic.html},
doi = {10.25080/majora-1b6fd038-005},
abstract = {Molecular dynamics (MD) computer simulations help elucidate details of the molecular processes in complex biological systems, from protein dynamics to drug discovery. One major issue is that these MD simulation files are now commonly terabytes in size, which means analyzing the data from these files becomes a painstakingly expensive task. In the age of national supercomputers, methods of parallel analysis are becoming a necessity for the efficient use of time and high performance computing (HPC) resources but for any approach to parallel analysis, simply reading the file from disk becomes the performance bottleneck that limits overall analysis speed. One promising way around this file I/O hurdle is to use a parallel message passing interface (MPI) implementation with the HDF5 (Hierarchical Data Format 5) file format to access a single file simultaneously with numerous processes on a parallel file system. Our previous feasibility study suggested that this combination can lead to favorable parallel scaling with hundreds of CPU cores, so we implemented a fast and user-friendly HDF5 reader (the H5MDReader class) that adheres to H5MD (HDF5 for Molecular Dynamics) specifications. We made H5MDReader (together with a H5MD output class H5MDWriter) available in the MDAnalysis library, a Python package that simplifies the process of reading and writing various popular MD file formats by providing a streamlined user-interface that is independent of any specific file format. We benchmarked H5MDReader's parallel file reading capabilities on three HPC clusters: ASU Agave, SDSC Comet, and PSC Bridges. The benchmark consisted of a simple split-apply-combine scheme of an I/O bound task that split a 90k frame (113 GiB) coordinate trajectory into chunks for processes, where each process performed the commonly used RMSD (root mean square distance after optimal structural superposition) calculation on their chunk of data, and then gathered the results back to the root process. For baseline performance, we found maximum I/O speedups at 2 full nodes, with Agave showing 20x, and a maximum computation speedup on Comet of 373x on 384 cores (all three HPCs scaled well in their computation task). We went on to test a series of optimizations attempting to speed up I/O performance, including adjusting file system stripe count, implementing a masked array feature that only loads relevant data for the computation task, front loading all I/O by loading the entire trajectory into memory, and manually adjusting the HDF5 dataset chunk shapes. We found the largest improvement in I/O performance by optimizing the chunk shape of the HDF5 datasets to match the iterative access pattern of our analysis benchmark. With respect to baseline serial performance, our best result was a 98x speedup at 112 cores on ASU Agave. In terms of absolute time saved, the analysis went from 4623 seconds in the baseline serial run to 47 seconds in the parallel, properly chunked run. Our results emphasize the fact that file I/O is not just dependent on the access pattern of the file, but more so the synergy between access pattern and the layout of the file on disk.},
urldate = {2021-07-05},
booktitle = {Proceedings of the 20th {Python} in {Science} {Conference}},
author = {Jakupovic, Edis and Beckstein, Oliver},
editor = {Agarwal, Meghann and Calloway, Chris and Niederhut, Dillon and Shupe, David},
year = {2021},
pages = {40--48},
}

@INPROCEEDINGS{MDAKits:2023,
title = "{MDAKits}: A framework for {FAIR-compliant} molecular
simulation analysis",
booktitle = "Proceedings of the Python in Science Conference",
author = "Alibay, Irfan and Wang, Lily and Naughton, Fiona and Kenney,
Ian and Barnoud, Jonathan and Gowers, Richard and Beckstein,
Oliver",
publisher = "SciPy",
pages = "76--84",
year = 2023,
conference = "Python in Science Conference",
location = "Austin, Texas"
}


@InProceedings{MDAnalysis:2016,
author = {Richard J. Gowers and Max Linke and Jonathan Barnoud and Tyler J. E. Reddy and Manuel N. Melo and Sean L. Seyler and Jan Domański and David L. Dotson and Sébastien Buchoux and Ian M. Kenney and Oliver Beckstein},
title = {{MDAnalysis}: A {Python} Package for the Rapid Analysis of Molecular Dynamics Simulations},
booktitle = {Proceedings of the 15th Python in Science Conference},
pages = {98--105},
year = {2016},
editor = {Sebastian Benthall and Scott Rostrup},
doi = {10.25080/Majora-629e541a-00e}
}


@article{MDAnalysis:2011,
author = {Michaud-Agrawal, Naveen and Denning, Elizabeth J. and Woolf, Thomas B. and Beckstein, Oliver},
title = {MDAnalysis: A toolkit for the analysis of molecular dynamics simulations},
journal = {Journal of Computational Chemistry},
volume = {32},
number = {10},
pages = {2319--2327},
keywords = {molecular dynamics simulations, analysis, proteins, object-oriented design, software, membrane systems, Python programming language},
doi = {10.1002/jcc.21787},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/jcc.21787},
eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/jcc.21787},
abstract = {Abstract MDAnalysis is an object-oriented library for structural and temporal analysis of molecular dynamics (MD) simulation trajectories and individual protein structures. It is written in the Python language with some performance-critical code in C. It uses the powerful NumPy package to expose trajectory data as fast and efficient NumPy arrays. It has been tested on systems of millions of particles. Many common file formats of simulation packages including CHARMM, Gromacs, Amber, and NAMD and the Protein Data Bank format can be read and written. Atoms can be selected with a syntax similar to CHARMM's powerful selection commands. MDAnalysis enables both novice and experienced programmers to rapidly write their own analytical tools and access data stored in trajectories in an easily accessible manner that facilitates interactive explorative analysis. MDAnalysis has been tested on and works for most Unix-based platforms such as Linux and Mac OS X. It is freely available under the GNU General Public License from http://mdanalysis.googlecode.com. © 2011 Wiley Periodicals, Inc. J Comput Chem 2011},
year = {2011}
}

@article {MDverse:2024,
article_type = {journal},
title = {MDverse, shedding light on the dark matter of molecular dynamics simulations},
author = {Tiemann, Johanna KS and Szczuka, Magdalena and Bouarroudj, Lisa and Oussaren, Mohamed and Garcia, Steven and Howard, Rebecca J and Delemotte, Lucie and Lindahl, Erik and Baaden, Marc and Lindorff-Larsen, Kresten and Chavent, Matthieu and Poulain, Pierre},
editor = {Haider, Shozeb and Cui, Qiang},
volume = 12,
year = 2024,
month = {aug},
pub_date = {2024-08-30},
pages = {RP90061},
citation = {eLife 2024;12:RP90061},
doi = {10.7554/eLife.90061},
url = {https://doi.org/10.7554/eLife.90061},
abstract = {The rise of open science and the absence of a global dedicated data repository for molecular dynamics (MD) simulations has led to the accumulation of MD files in generalist data repositories, constituting the \textit{dark matter of MD} — data that is technically accessible, but neither indexed, curated, or easily searchable. Leveraging an original search strategy, we found and indexed about 250,000 files and 2000 datasets from Zenodo, Figshare and Open Science Framework. With a focus on files produced by the Gromacs MD software, we illustrate the potential offered by the mining of publicly available MD data. We identified systems with specific molecular composition and were able to characterize essential parameters of MD simulation such as temperature and simulation length, and could identify model resolution, such as all-atom and coarse-grain. Based on this analysis, we inferred metadata to propose a search engine prototype to explore the MD data. To continue in this direction, we call on the community to pursue the effort of sharing MD data, and to report and standardize metadata to reuse this valuable matter.},
keywords = {molecular dynamics, simulation, modeling, FAIR},
journal = {eLife},
issn = {2050-084X},
publisher = {eLife Sciences Publications, Ltd},
}

@Article{NumPy:2020,
title = {Array programming with {NumPy}},
author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J.
van der Walt and Ralf Gommers and Pauli Virtanen and David
Cournapeau and Eric Wieser and Julian Taylor and Sebastian
Berg and Nathaniel J. Smith and Robert Kern and Matti Picus
and Stephan Hoyer and Marten H. van Kerkwijk and Matthew
Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del
R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre
G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and
Warren Weckesser and Hameer Abbasi and Christoph Gohlke and
Travis E. Oliphant},
year = {2020},
month = sep,
journal = {Nature},
volume = {585},
number = {7825},
pages = {357--362},
doi = {10.1038/s41586-020-2649-2},
publisher = {Springer Science and Business Media {LLC}},
url = {https://doi.org/10.1038/s41586-020-2649-2}
}

@article{PANGEO:2022,
author = {Stern, Charles and Abernathey, Ryan and Hamman, Joseph and Wegener, Rachel and Lepore, Chiara and Harkins, Sean and Merose, Alexander},
title = {Pangeo Forge: Crowdsourcing Analysis-Ready, Cloud Optimized Data Production},
journal = {Frontiers in Climate},
volume = {3},
year = {2022},
url = {https://www.frontiersin.org/journals/climate/articles/10.3389/fclim.2021.782909},
doi = {10.3389/fclim.2021.782909},
issn = {2624-9553},
abstract = {Pangeo Forge is a new community-driven platform that accelerates science by providing high-level recipe frameworks alongside cloud compute infrastructure for extracting data from provider archives, transforming it into analysis-ready, cloud-optimized (ARCO) data stores, and providing a human- and machine-readable catalog for browsing and loading. In abstracting the scientific domain logic of data recipes from cloud infrastructure concerns, Pangeo Forge aims to open a door for a broader community of scientists to participate in ARCO data production. A wholly open-source platform composed of multiple modular components, Pangeo Forge presents a foundation for the practice of reproducible, cloud-native, big-data ocean, weather, and climate science without relying on proprietary or cloud-vendor-specific tooling.}
}

@inproceedings{ParallelAnalysis:2010,
author = {Tu, Tiankai and Rendleman, Charles A. and Miller, Patrick J. and Sacerdoti, Federico and Dror, Ron O. and Shaw, David E.},
title = {Accelerating parallel analysis of scientific simulation data via Zazen},
year = {2010},
publisher = {USENIX Association},
address = {USA},
abstract = {As a new generation of parallel supercomputers enables researchers to conduct scientific simulations of unprecedented scale and resolution, terabyte-scale simulation output has become increasingly commonplace. Analysis of such massive data sets is typically I/O-bound: many parallel analysis programs spend most of their execution time reading data from disk rather than performing useful computation. To overcome this I/O bottleneck, we have developed a new data access method. Our main idea is to cache a copy of simulation output files on the local disks of an analysis cluster's compute nodes, and to use a novel task-assignment protocol to co-locate data access with computation. We have implemented our methodology in a parallel disk cache system called Zazen. By avoiding the overhead associated with querying metadata servers and by reading data in parallel from local disks, Zazen is able to deliver a sustained read bandwidth of over 20 gigabytes per second on a commodity Linux cluster with 100 nodes, approaching the optimal aggregated I/O bandwidth attainable on these nodes. Compared with conventional NFS, PVFS2, and Hadoop/HDFS, respectively, Zazen is 75, 18, and 6 times faster for accessing large (1-GB) files, and 25, 13, and 85 times faster for accessing small (2-MB) files. We have deployed Zazen in conjunction with Anton--a special-purpose supercomputer that dramatically accelerates molecular dynamics (MD) simulations-- and have been able to accelerate the parallel analysis of terabyte-scale MD trajectories by about an order of magnitude.},
booktitle = {Proceedings of the 8th USENIX Conference on File and Storage Technologies},
pages = {10},
numpages = {1},
location = {San Jose, California},
series = {FAST'10}
}

@misc{Zarr:2024,
doi = {10.5281/ZENODO.3773449},
url = {https://zenodo.org/doi/10.5281/zenodo.3773449},
author = {Alistair Miles and jakirkham and M Bussonnier and Josh Moore and Dimitri Papadopoulos Orfanos and Davis Bennett and David Stansby and Joe Hamman and James Bourbeau and Andrew Fulton and Gregory Lee and Ryan Abernathey and Norman Rzepka and Zain Patel and Mads R. B. Kristensen and Sanket Verma and Saransh Chopra and Matthew Rocklin and AWA BRANDON AWA and Max Jones and Martin Durant and Elliott Sales de Andrade and Vincent Schut and raphael dussin and Shivank Chaudhary and Chris Barnes and Juan Nunez-Iglesias and shikharsg},
title = {zarr-developers/zarr-python: v3.0.0-alpha},
publisher = {Zenodo},
year = {2024},
copyright = {Creative Commons Attribution 4.0 International}
}
