diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..476326b
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,21 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.12"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+ configuration: docs/conf.py
+ fail_on_warning: true
+
+python:
+ install:
+ - method: pip
+ path: .
+ extra_requirements:
+ - docs
diff --git a/docs/_static/cladetime_logo_dark_mode.png b/docs/_static/cladetime_logo_dark_mode.png
new file mode 100644
index 0000000..2129dc5
Binary files /dev/null and b/docs/_static/cladetime_logo_dark_mode.png differ
diff --git a/docs/_static/cladetime_logo_light_mode.png b/docs/_static/cladetime_logo_light_mode.png
new file mode 100644
index 0000000..808dc5b
Binary files /dev/null and b/docs/_static/cladetime_logo_light_mode.png differ
diff --git a/docs/_static/reichlab.png b/docs/_static/reichlab.png
new file mode 100644
index 0000000..b372c61
Binary files /dev/null and b/docs/_static/reichlab.png differ
diff --git a/docs/_static/reichlab_favicon.png b/docs/_static/reichlab_favicon.png
new file mode 100644
index 0000000..3cce4ff
Binary files /dev/null and b/docs/_static/reichlab_favicon.png differ
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..8c79c0c
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,137 @@
+import os
+import sys
+from datetime import date
+
+# Configuration file for the Sphinx documentation builder.
+
+# -- Project information
+
+project = "Cladetime"
+project_copyright = f"{date.today().year}, Reich Lab @ The University of Massachusetts Amherst"
+author = "Reich Lab"
+
+# Add cladetime location to the path, so we can use autodoc to
+# generate API documentation from docstrings.
+root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.insert(0, root_path)
+
+# FIXME: get the version dynamically (version = short X.Y, release = full)
+version = "0.1"
+release = "0.1.0"
+
+# -- General configuration
+
+extensions = [
+ "myst_parser",
+ "sphinx.ext.autodoc",
+ "sphinx_copybutton",
+ "sphinx.ext.doctest",
+ "sphinx.ext.intersphinx",
+ "sphinx_github_style",
+ "sphinxext.opengraph",
+ "sphinx.ext.napoleon",
+]
+
+intersphinx_mapping = {
+ "python": ("https://docs.python.org/3/", None),
+ "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
+ "polars": ("https://docs.pola.rs/api/python/stable", None),
+}
+intersphinx_disabled_domains = ["std"]
+
+# Copied these settings from the copybutton's config
+# https://github.com/executablebooks/sphinx-copybutton/blob/master/docs/conf.py
+copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: "
+copybutton_prompt_is_regexp = True
+copybutton_line_continuation_character = "\\"
+copybutton_here_doc_delimiter = "EOT"
+copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre"
+
+templates_path = ["_templates"]
+
+# The root toctree document.
+root_doc = "index"
+
+# Test code blocks only when explicitly specified
+doctest_test_doctest_blocks = ""
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_static_path = ["_static"]
+html_theme = "furo"
+html_favicon = "_static/reichlab_favicon.png"
+html_title = "Cladetime"
+html_last_updated_fmt = "%Y-%m-%d"
+
+# Settings for the GitHub link extension
+linkcode_url = "https://github.com/reichlab/cladetime"
+
+# Options for the Furo HTML theme (announcement, logos, footer icons)
+html_theme_options = {
+ "announcement": """
+
+ Cladetime is a work in progress. Please feel free to file issues on GitHub.
+
+ """,
+ "sidebar_hide_name": True,
+ "light_logo": "cladetime_logo_light_mode.png",
+ "dark_logo": "cladetime_logo_dark_mode.png",
+ "navigation_with_keys": True,
+ "source_repository": "https://github.com/reichlab/cladetime/",
+ # source for GitHub footer icon:
+ # https://pradyunsg.me/furo/customisation/footer/#using-embedded-svgs
+ "footer_icons": [
+ {
+ "name": "GitHub",
+ "url": "https://github.com/reichlab/cladetime",
+ "html": """
+
+ """,
+ "class": "",
+ },
+ ],
+}
+
+# from https://myst-parser.readthedocs.io/en/latest/syntax/optional.html
+myst_enable_extensions = [
+ "amsmath",
+ "deflist",
+ "dollarmath",
+ "fieldlist",
+ "substitution",
+ "tasklist",
+ "colon_fence",
+ "attrs_inline",
+]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "friendly"
+
+# Show typehints as content of the function or method
+autodoc_typehints = "description"
+autodoc_member_order = "bysource"
+
+# Open Graph metadata
+ogp_site_url = "https://cladetime.readthedocs.io"
+ogp_title = "cladetime documentation"
+ogp_type = "website"
+ogp_image = "https://cladetime.readthedocs.io/en/latest/_static/cladetime_logo_light_mode.png"
+ogp_social_cards = {
+ "image": "https://cladetime.readthedocs.io/en/latest/_static/cladetime_logo_light_mode.png",
+ "line_color": "#5d9c9c",
+}
+
+# Warn about all references to unknown targets
+nitpicky = True
+nitpick_ignore = [
+ ("py:class", "datetime"),
+ ("py:class", "polars.LazyFrame"),
+ ("py:class", "polars.lazyframe.frame.LazyFrame"),
+]
+
+
+# -- Options for EPUB output
+epub_show_urls = "footnote"
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..bc25652
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,68 @@
+:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain.
+
+===============
+ Cladetime
+===============
+
+Cladetime is a Python interface for accessing `Nextstrain <https://nextstrain.org/>`_ SARS-CoV-2 sequence and clade data.
+
+.. toctree::
+ :titlesonly:
+ :hidden:
+
+ Home <self>
+ user-guide
+ reference/index
+
+Installation
+------------
+
+Cladetime can be installed with `pip <https://pip.pypa.io/>`_:
+
+.. code-block:: bash
+
+ $ pip install git+https://github.com/reichlab/cladetime.git
+
+
+Usage
+-----
+
+The :any:`CladeTime` class provides a lightweight wrapper around historical and current
+SARS-CoV-2 GenBank sequences and sequence metadata created by `nextstrain.org's <https://nextstrain.org/>`_
+daily workflow pipeline.
+
+.. code-block:: python
+
+ >>> import polars as pl
+ >>> from cladetime import CladeTime
+
+ >>> ct = CladeTime()
+ >>> filtered_sequence_metadata = (
+ ... ct.sequence_metadata.select(["country", "division", "date", "host", "clade_nextstrain"])
+ ... .filter(
+ ... pl.col("country") == "USA",
+ ... pl.col("date").is_not_null(),
+ ... pl.col("host") == "Homo sapiens",
+ ... )
+ ... .cast({"date": pl.Date}, strict=False)
+ ... )
+
+ >>> filtered_sequence_metadata.head(5).collect()
+
+ shape: (5, 5)
+ ┌─────────┬──────────┬────────────┬──────────────┬──────────────────┐
+ │ country ┆ division ┆ date ┆ host ┆ clade_nextstrain │
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ │ str ┆ str ┆ date ┆ str ┆ str │
+ ╞═════════╪══════════╪════════════╪══════════════╪══════════════════╡
+ │ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A │
+ │ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B │
+ │ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B │
+ │ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B │
+ │ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B │
+ └─────────┴──────────┴────────────┴──────────────┴──────────────────┘
+
+See the :doc:`user-guide` for more details about working with Cladetime.
+
+The :doc:`reference/index` documentation provides API-level documentation.
+
diff --git a/docs/reference/cladetime.rst b/docs/reference/cladetime.rst
new file mode 100644
index 0000000..35d1615
--- /dev/null
+++ b/docs/reference/cladetime.rst
@@ -0,0 +1,8 @@
+:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain.
+
+==========
+CladeTime
+==========
+
+.. autoclass:: cladetime.CladeTime
+ :members:
diff --git a/docs/reference/index.rst b/docs/reference/index.rst
new file mode 100644
index 0000000..7660209
--- /dev/null
+++ b/docs/reference/index.rst
@@ -0,0 +1,6 @@
+API Reference
+=============
+
+.. toctree::
+
+ cladetime
diff --git a/docs/user-guide.rst b/docs/user-guide.rst
new file mode 100644
index 0000000..fb1f6aa
--- /dev/null
+++ b/docs/user-guide.rst
@@ -0,0 +1,94 @@
+===============
+User Guide
+===============
+
+
+
+Finding Nextstrain SARS-CoV-2 sequences and sequence metadata
+--------------------------------------------------------------
+
+Cladetime's CladeTime class provides a lightweight interface to nextstrain.org files.
+
+.. code-block:: python
+
+ >>> from cladetime import CladeTime
+
+ # Instantiating a CladeTime object with no parameters will use the
+ # latest available data from nextstrain.org.
+ >>> ct = CladeTime()
+
+ # URL to the most recent SARS-CoV-2 sequence file (.fasta)
+ >>> ct.url_sequence
+ 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/sequences.fasta.zst?versionId=d66Hn1T0eFMAg8osEh8Yrod.QEUBRxvu'
+
+ # URL to the metadata that describes the sequences in the above file
+ >>> ct.url_sequence_metadata
+ 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JTXXFlKyyvt9AerxKMwoZflhFYQFrDek'
+
+ # Metadata about the Nextstrain data pipeline that generated the sequence file and its metadata
+ >>> ct.ncov_metadata
+ {'schema_version': 'v1',
+ 'nextclade_version': 'nextclade 3.8.2',
+ 'nextclade_dataset_name': 'SARS-CoV-2',
+ 'nextclade_dataset_version': '2024-09-25--21-50-30Z',
+ 'nextclade_tsv_sha256sum': '5b0f2b64bfe694a3c96bd5a116de8fae23b144bfd3d22da774d4bfe9a84399c3',
+ 'metadata_tsv_sha256sum': '1dc6a4204039e5c69eed84583faf75bbec1629e531dc99aab5bd566d3fb28295'}
+
+
+Working with SARS-CoV-2 sequence metadata
+------------------------------------------
+
+The CladeTime class also provides a Polars LazyFrame object that points to Nextstrain's sequence metadata file.
+This file is in .tsv format and contains information about the sequences, such as their collection date,
+host, and location.
+
+The metadata also includes a clade assignment for each sequence. Nextstrain assigns clades based on a reference tree,
+and the reference tree varies over time.
+
+.. code-block:: python
+
+ >>> import polars as pl
+ >>> from cladetime import CladeTime
+
+ >>> ct = CladeTime()
+
+ # ct contains a Polars LazyFrame that references the sequence metadata .tsv file on AWS S3
+ >>> lf = ct.sequence_metadata
+ >>> lf
+
+
+
+Getting historical SARS-CoV-2 sequence metadata
+------------------------------------------------
+
+A CladeTime instance created without parameters will reference the most
+recent data available from Nextstrain.
+
+To access sequence metadata at a specific point in time, pass a date string
+in the format 'YYYY-MM-DD' to the CladeTime constructor. Alternatively, you can pass
+a Python datetime object. Both will be treated as UTC dates/times. If a date string
+is specified, the datetime will be set to 00:00:00 hours:minutes:seconds on that
+date, meaning that the CladeTime object will retrieve the sequence metadata that
+was available at the start of that day.
+
+.. code-block:: python
+
+ >>> import polars as pl
+ >>> from cladetime import CladeTime
+ >>> ct = CladeTime(sequence_as_of="2024-08-02")
+
+ # ct operations now reference the version of the sequence metadata
+ # that was available at midnight UTC on August 2, 2024.
+ >>> ct.sequence_metadata \
+ ... .cast({"date": pl.Date}, strict=False) \
+ ... .select(pl.max("date")).collect()
+
+ shape: (1, 1)
+ ┌────────────┐
+ │ date │
+ │ --- │
+ │ date │
+ ╞════════════╡
+ │ 2024-07-23 │
+ └────────────┘
+
diff --git a/pyproject.toml b/pyproject.toml
index 260fd1c..74c8192 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,11 @@ authors = [
requires-python = ">=3.11"
readme = "README.md"
-license = {text = "MIT"}
+
+classifiers = [
+ "Development Status :: 3 - Alpha",
+ "License :: OSI Approved :: MIT License",
+]
dependencies = [
"awscli>=1.32.92",
@@ -41,9 +45,20 @@ dev = [
"types-python-dateutil",
"types-requests",
]
+docs = [
+ "furo",
+ "matplotlib",
+ "myst-parser",
+ "sphinx>=5.0,<6.0",
+ "sphinx-copybutton",
+ "sphinx-github-style",
+ "sphinxext-opengraph",
+ ]
[project.urls]
Repository = "https://github.com/reichlab/cladetime.git"
+Documentation = "https://cladetime.readthedocs.io/"
+Issues = "https://github.com/reichlab/cladetime/issues"
[project.entry-points."console_scripts"]
assign_clades = "cladetime.assign_clades:main"
diff --git a/src/cladetime/cladetime.py b/src/cladetime/cladetime.py
index d3b108d..47a2589 100644
--- a/src/cladetime/cladetime.py
+++ b/src/cladetime/cladetime.py
@@ -15,51 +15,45 @@
class CladeTime:
- """
- Wrapper around Nextstrain/Nextclade tooling to generate Sars-CoV-2 genome clade assignments
- and aggregations at a specific point in time. CladeTime operates on Genbank sequences.
+ """Interface for Nextstrain SARS-CoV-2 genome sequences and clades.
+
+ The CladeTime class is instantiated with two optional arguments that
+ specify the point in time at which to access genome sequences/metadata
+ as well as the reference tree used for clade assignment. CladeTime
+ interacts with GenBank-based data provided by the Nextstrain project.
+
+ Parameters
+ ----------
+ sequence_as_of : datetime.datetime | str | None
+ Sets the versions of Nextstrain SARS-CoV-2 genome sequence
+ and sequence metadata files that will be used by
+ CladeTime properties and methods. Can be a datetime object or a
+ string in YYYY-MM-DD format, both of which will be treated as
+ UTC. The default value is the current time.
+ tree_as_of : datetime.datetime | str | None
+ Sets the version of the Nextstrain reference tree that will be
+ used by CladeTime. Can be a datetime object or a string in
+ YYYY-MM-DD format, both of which will be treated as UTC.
+ The default value is :any:`sequence_as_of`
Attributes
----------
- sequence_as_of : datetime
- Use the NextStrain sequences and sequence metadata that were available
- as of this date and time (UTC).
- ncov_metadata : dict
- Metadata for the Nextstrain ncov pipeline that generated the sequence and
- sequence metadata that correspond to the sequence_as_of date.
- metadata_metadata : pl.LazyFrame
- A Polars lazyframe reference to url_sequence_metadata.
- tree_as_of : datetime
- Use the NextStrain reference tree that was available as of this
- date and time (UTC).
- Can be a datetime object, a string in the format
- "YYYY-MM-DD", or None (which defaults to the current date and time).
- url_ncov_metadata: str
- S3 URL to the Nextstrain ncov metadata file (.json)
+ url_ncov_metadata : str
+ S3 URL to metadata from the Nextstrain pipeline run that
+ generated the sequence clade assignments in
+ :any:`url_sequence_metadata`
url_sequence : str
S3 URL to the Nextstrain Sars-CoV-2 sequence file (zst-compressed
- .fasta) that was available at the sequence_as_of.
+ .fasta) that was current at the date specified in
+ :any:`sequence_as_of`
url_sequence_metadata : str
S3 URL to the Nextstrain Sars-CoV-2 sequence metadata file
- (zst-compressed tsv) that was available at the sequence_as_of.
+ (zst-compressed tsv) that was current at the date specified in
+ :any:`sequence_as_of`
"""
def __init__(self, sequence_as_of=None, tree_as_of=None):
- """
- Parameters
- ----------
- sequence_as_of : datetime | str | None, default = now()
- Use the NextStrain sequences and sequence metadata that were available
- as of this date. Can be a datetime object, a string in the format
- "YYYY-MM-DD", or None (which defaults to the current date and time).
- CladeTime treats all dates and times as UTC.
- tree_as_of : datetime | str | None, default = now()
- Use the NextStrain reference tree that was available as of this date.
- Can be a datetime object, a string in the format
- "YYYY-MM-DD", or None (which defaults to the sequence_as_of date).
- CladeTime treats all dates and times as UTC.
- """
-
+ """CladeTime constructor."""
self._config = self._get_config()
self.sequence_as_of = sequence_as_of
self.tree_as_of = tree_as_of
@@ -83,11 +77,16 @@ def __init__(self, sequence_as_of=None, tree_as_of=None):
@property
def sequence_as_of(self) -> datetime:
+ """
+ datetime.datetime : The date and time (UTC) used to retrieve Nextstrain sequences
+ and sequence metadata. :any:`url_sequence` and
+ :any:`url_sequence_metadata` link to
+ Nextstrain files that were current as of this date.
+ """
return self._sequence_as_of
@sequence_as_of.setter
def sequence_as_of(self, date) -> None:
- """Set the sequence_as_of attribute."""
sequence_as_of = self._validate_as_of_date(date)
utc_now = datetime.now(timezone.utc)
if sequence_as_of > utc_now:
@@ -101,11 +100,16 @@ def sequence_as_of(self, date) -> None:
@property
def tree_as_of(self) -> datetime:
+ """
+ datetime.datetime : The date and time (UTC) used to retrieve the Nextstrain
+ reference tree. :any:`get_reference_tree`
+ is intended to return the reference tree that was current as
+ of this date.
+ """
return self._tree_as_of
@tree_as_of.setter
def tree_as_of(self, date) -> None:
- """Set the tree_as_of attribute."""
if date is None:
tree_as_of = self.sequence_as_of
else:
@@ -126,7 +130,12 @@ def ncov_metadata(self):
@ncov_metadata.getter
def ncov_metadata(self) -> dict:
- """Get the ncov_metadata attribute."""
+ """
+ dict : Metadata for the reference tree that was used for SARS-CoV-2
+ clade assignments as of :any:`tree_as_of`.
+ This property will be empty for dates before 2024-08-01, when
+ Nextstrain began publishing ncov pipeline metadata.
+ """
if self.url_ncov_metadata:
metadata = _get_ncov_metadata(self.url_ncov_metadata)
return metadata
@@ -140,7 +149,10 @@ def sequence_metadata(self):
@sequence_metadata.getter
def sequence_metadata(self) -> pl.LazyFrame:
- """Get the sequence_metadata attribute."""
+ """
+ :class:`polars.LazyFrame` : A Polars LazyFrame that references
+ :any:`url_sequence_metadata`
+ """
if self.url_sequence_metadata:
sequence_metadata = get_covid_genome_metadata(metadata_url=self.url_sequence_metadata)
return sequence_metadata
@@ -182,3 +194,17 @@ def _validate_as_of_date(self, as_of: str) -> datetime:
raise CladeTimeInvalidDateError(f"Date must be after May 1, 2023: {as_of_date}")
return as_of_date
+
+ def get_reference_tree(self) -> dict:
+ """Return the reference tree used for SARS-CoV-2 clade assignments.
+
+ Retrieves the reference tree current as of :any:`tree_as_of`.
+
+ This method is not yet implemented.
+
+ Returns
+ -------
+ dict
+ For now, a placeholder: ``{tree_as_of: "not implemented"}``.
+ """
+ return {self.tree_as_of: "not implemented"}