diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..476326b --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,21 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: true + +python: + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/docs/_static/cladetime_logo_dark_mode.png b/docs/_static/cladetime_logo_dark_mode.png new file mode 100644 index 0000000..2129dc5 Binary files /dev/null and b/docs/_static/cladetime_logo_dark_mode.png differ diff --git a/docs/_static/cladetime_logo_light_mode.png b/docs/_static/cladetime_logo_light_mode.png new file mode 100644 index 0000000..808dc5b Binary files /dev/null and b/docs/_static/cladetime_logo_light_mode.png differ diff --git a/docs/_static/reichlab.png b/docs/_static/reichlab.png new file mode 100644 index 0000000..b372c61 Binary files /dev/null and b/docs/_static/reichlab.png differ diff --git a/docs/_static/reichlab_favicon.png b/docs/_static/reichlab_favicon.png new file mode 100644 index 0000000..3cce4ff Binary files /dev/null and b/docs/_static/reichlab_favicon.png differ diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..8c79c0c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,137 @@ +import os +import sys +from datetime import date + +# Configuration file for the Sphinx documentation builder. + +# -- Project information + +project = "Cladetime" +project_copyright = f"{date.today().year}, Reich Lab @ The University of Massachusetts Amherst" +author = "Reich Lab" + +# Add cladetime location to the path, so we can use autodoc to +# generate API documentation from docstrings. 
+root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, root_path) + +release = "0.1" +# FIXME: get the version dynamically +version = "0.1.0" + +# -- General configuration + +extensions = [ + "myst_parser", + "sphinx.ext.autodoc", + "sphinx_copybutton", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx_github_style", + "sphinxext.opengraph", + "sphinx.ext.napoleon", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master/", None), + "polars": ("https://docs.pola.rs/api/python/stable", None), +} +intersphinx_disabled_domains = ["std"] + +# Copied these settings from the copybutton's config +# https://github.com/executablebooks/sphinx-copybutton/blob/master/docs/conf.py +copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True +copybutton_line_continuation_character = "\\" +copybutton_here_doc_delimiter = "EOT" +copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" + +templates_path = ["_templates"] + +# The root toctree document. +root_doc = "index" + +# Test code blocks only when explicitly specified +doctest_test_doctest_blocks = "" + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_static_path = ["_static"] +html_theme = "furo" +html_favicon = "_static/reichlab_favicon.png" +html_title = "Cladetime" +html_last_updated_fmt = "%Y-%m-%d" + +# Settings for the GitHub link extension +linkcode_url = "https://github.com/reichlab/cladetime" + +# These folders are copied to the documentation's HTML output +html_theme_options = { + "announcement": """ + + Cladetime is a work in progress. Please feel free to file issues on GitHub. 
+ + """, + "sidebar_hide_name": True, + "light_logo": "cladetime_logo_light_mode.png", + "dark_logo": "cladetime_logo_dark_mode.png", + "navigation_with_keys": True, + "source_repository": "https://github.com/reichlab/cladetime/", + # source for GitHub footer icon: + # https://pradyunsg.me/furo/customisation/footer/#using-embedded-svgs + "footer_icons": [ + { + "name": "GitHub", + "url": "https://github.com/reichlab/cladetime", + "html": """ + + + + """, + "class": "", + }, + ], +} + +# from https://myst-parser.readthedocs.io/en/latest/syntax/optional.html +myst_enable_extensions = [ + "amsmath", + "deflist", + "dollarmath", + "fieldlist", + "substitution", + "tasklist", + "colon_fence", + "attrs_inline", +] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "friendly" + +# Show typehints as content of the function or method +autodoc_typehints = "description" +autodoc_member_order = "bysource" + +# Open Graph metadata +ogp_site_url = "https://cladetime.readthedocs.io" +ogp_title = "cladetime documentation" +ogp_type = "website" +ogp_image = "https://cladetime.readthedocs.io/en/latest/_static/cladetime_logo_light_mode.png" +ogp_social_cards = { + "image": "https://cladetime.readthedocs.io/en/latest/_static/cladetime_logo_light_mode.png", + "line_color": "#5d9c9c", +} + +# Warn about all references to unknown targets +nitpicky = True +nitpick_ignore = [ + ("py:class", "datetime"), + ("py:class", "polars.LazyFrame"), + ("py:class", "polars.lazyframe.frame.LazyFrame"), +] + + +# -- Options for EPUB output +epub_show_urls = "footnote" diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..bc25652 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,68 @@ +:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain. 
+ +=============== + Cladetime +=============== + +Cladetime is a Python interface for accessing `Nextstrain <https://nextstrain.org/>`_ SARS-CoV-2 sequence and clade data. + +.. toctree:: + :titlesonly: + :hidden: + + Home <self> + user-guide + reference/index + +Installation +------------ + +Cladetime can be installed with `pip <https://pip.pypa.io/>`_: + +.. code-block:: bash + + $ pip install git+https://github.com/reichlab/cladetime.git + + +Usage +----- + +The :any:`CladeTime` class provides a lightweight wrapper around historical and current +SARS-CoV-2 GenBank sequence and sequence metadata created by `nextstrain.org's <https://nextstrain.org/>`_ +daily workflow pipeline. + +.. code-block:: python + + >>> import polars as pl + >>> from cladetime import CladeTime + + >>> ct = CladeTime() + >>> filtered_sequence_metadata = ( + ... ct.sequence_metadata.select(["country", "division", "date", "host", "clade_nextstrain"]) + ... .filter( + ... pl.col("country") == "USA", + ... pl.col("date").is_not_null(), + ... pl.col("host") == "Homo sapiens", + ... ) + ... .cast({"date": pl.Date}, strict=False) + ... ) + + >>> filtered_sequence_metadata.head(5).collect() + + shape: (5, 5) + ┌─────────┬──────────┬────────────┬──────────────┬──────────────────┐ + │ country ┆ division ┆ date ┆ host ┆ clade_nextstrain │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ date ┆ str ┆ str │ + ╞═════════╪══════════╪════════════╪══════════════╪══════════════════╡ + │ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A │ + │ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B │ + │ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B │ + │ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B │ + │ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B │ + └─────────┴──────────┴────────────┴──────────────┴──────────────────┘ + +See the :doc:`user-guide` for more details about working with Cladetime. + +The :doc:`reference/index` documentation provides API-level documentation. 
+ diff --git a/docs/reference/cladetime.rst b/docs/reference/cladetime.rst new file mode 100644 index 0000000..35d1615 --- /dev/null +++ b/docs/reference/cladetime.rst @@ -0,0 +1,8 @@ +:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain. + +========== +CladeTime +========== + +.. autoclass:: cladetime.CladeTime + :members: diff --git a/docs/reference/index.rst b/docs/reference/index.rst new file mode 100644 index 0000000..7660209 --- /dev/null +++ b/docs/reference/index.rst @@ -0,0 +1,6 @@ +API Reference +============= + +.. toctree:: + + cladetime diff --git a/docs/user-guide.rst b/docs/user-guide.rst new file mode 100644 index 0000000..fb1f6aa --- /dev/null +++ b/docs/user-guide.rst @@ -0,0 +1,94 @@ +=============== +User Guide +=============== + + + +Finding Nextstrain SARS-CoV-2 sequences and sequence metadata +-------------------------------------------------------------- + +Cladetime's CladeTime class provides a lightweight interface to nextstrain.org files. + +.. code-block:: python + + >>> from cladetime import CladeTime + + # Instantiating a CladeTime object with no parameters will use the + # latest available data from nextstrain.org. 
+ >>> ct = CladeTime() + + # URL to the most recent SARS-CoV-2 sequence file (.fasta) + >>> ct.url_sequence + 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/sequences.fasta.zst?versionId=d66Hn1T0eFMAg8osEh8Yrod.QEUBRxvu' + + # URL to the metadata that describes the sequences in the above file + >>> ct.url_sequence_metadata + 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JTXXFlKyyvt9AerxKMwoZflhFYQFrDek' + + # Metadata about the Nextstrain data pipeline that generated the sequence file and its metadata + >>> ct.ncov_metadata + {'schema_version': 'v1', + 'nextclade_version': 'nextclade 3.8.2', + 'nextclade_dataset_name': 'SARS-CoV-2', + 'nextclade_dataset_version': '2024-09-25--21-50-30Z', + 'nextclade_tsv_sha256sum': '5b0f2b64bfe694a3c96bd5a116de8fae23b144bfd3d22da774d4bfe9a84399c3', + 'metadata_tsv_sha256sum': '1dc6a4204039e5c69eed84583faf75bbec1629e531dc99aab5bd566d3fb28295'} + + +Working with SARS-CoV-2 sequence metadata +------------------------------------------ + +The CladeTime class also provides a Polars LazyFrame object that points to Nextstrain's sequence metadata file. +This file is in .tsv format and contains information about the sequences, such as their collection date, +host, and location. + +The metadata also includes a clade assignment for each sequence. Nextstrain assigns clades based on a reference tree, +and the reference tree varies over time. + +.. code-block:: python + + >>> import polars as pl + >>> from cladetime import CladeTime + + >>> ct = CladeTime() + + # ct contains a Polars LazyFrame that references the sequence metadata .tsv file on AWS S3 + >>> lf = ct.sequence_metadata + >>> lf + + + +Getting historical SARS-CoV-2 sequence metadata +------------------------------------------------ + +A CladeTime instance created without parameters will reference the most +recent data available from Nextstrain. 
+ +To access sequence metadata at a specific point in time, pass a date string +in the format 'YYYY-MM-DD' to the CladeTime constructor. Alternatively, you can pass +a Python datetime object. Both will be treated as UTC dates/times. If a date string +is specified, the datetime will be set to 00:00:00 hours:minutes:seconds on that +date, meaning that the CladeTime object will retrieve the sequence metadata that +was available at the start of that day. + +.. code-block:: python + + >>> from cladetime import CladeTime + + >>> ct = CladeTime(sequence_as_of="2024-08-02") + + # ct operations now reference the version of the sequence metadata + # that was available at midnight UTC on August 2, 2024. + >>> ct.sequence_metadata \ + ... .cast({"date": pl.Date}, strict=False) \ + ... .select(pl.max("date")).collect() + + shape: (1, 1) + ┌────────────┐ + │ date │ + │ --- │ + │ date │ + ╞════════════╡ + │ 2024-07-23 │ + └────────────┘ + diff --git a/pyproject.toml b/pyproject.toml index 260fd1c..74c8192 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,11 @@ authors = [ requires-python = ">=3.11" readme = "README.md" -license = {text = "MIT"} + +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: MIT License", +] dependencies = [ "awscli>=1.32.92", @@ -41,9 +45,20 @@ dev = [ "types-python-dateutil", "types-requests", ] +docs = [ + "furo", + "matplotlib", + "myst-parser", + "sphinx>=5.0,<6.0", + "sphinx-copybutton", + "sphinx-github-style", + "sphinxext-opengraph", + ] [project.urls] Repository = "https://github.com/reichlab/cladetime.git" +Documentation = "https://cladetime.readthedocs.io/" +Issues = "https://github.com/reichlab/cladetime/issues" [project.entry-points."console_scripts"] assign_clades = "cladetime.assign_clades:main" diff --git a/src/cladetime/cladetime.py b/src/cladetime/cladetime.py index d3b108d..47a2589 100644 --- a/src/cladetime/cladetime.py +++ b/src/cladetime/cladetime.py @@ -15,51 +15,45 @@ class CladeTime: - 
""" - Wrapper around Nextstrain/Nextclade tooling to generate Sars-CoV-2 genome clade assignments - and aggregations at a specific point in time. CladeTime operates on Genbank sequences. + """Interface for Nextstrain SARS-CoV-2 genome sequences and clades. + + The CladeTime class is instantiated with two optional arguments that + specify the point in time at which to access genome sequences/metadata + as well as the reference tree used for clade assignment. CladeTime + interacts with GenBank-based data provided by the Nextstrain project. + + Parameters + ---------- + sequence_as_of : datetime.datetime | str | None + Sets the versions of Nextstrain SARS-CoV-2 genome sequence + and sequence metadata files that will be used by + CladeTime properties and methods. Can be a datetime object or a + string in YYYY-MM-DD format, both of which will be treated as + UTC. The default value is the current time. + tree_as_of : datetime.datetime | str | None + Sets the version of the Nextstrain reference tree that will be + used by CladeTime. Can be a datetime object or a string in + YYYY-MM-DD format, both of which will be treated as UTC. + The default value is :any:`sequence_as_of` Attributes ---------- - sequence_as_of : datetime - Use the NextStrain sequences and sequence metadata that were available - as of this date and time (UTC). - ncov_metadata : dict - Metadata for the Nextstrain ncov pipeline that generated the sequence and - sequence metadata that correspond to the sequence_as_of date. - metadata_metadata : pl.LazyFrame - A Polars lazyframe reference to url_sequence_metadata. - tree_as_of : datetime - Use the NextStrain reference tree that was available as of this - date and time (UTC). - Can be a datetime object, a string in the format - "YYYY-MM-DD", or None (which defaults to the current date and time). 
- url_ncov_metadata: str - S3 URL to the Nextstrain ncov metadata file (.json) + url_ncov_metadata : str + S3 URL to metadata from the Nextstrain pipeline run that + generated the sequence clade assignments in + :any:`url_sequence_metadata` url_sequence : str S3 URL to the Nextstrain Sars-CoV-2 sequence file (zst-compressed - .fasta) that was available at the sequence_as_of. + .fasta) that was current at the date specified in + :any:`sequence_as_of` url_sequence_metadata : str S3 URL to the Nextstrain Sars-CoV-2 sequence metadata file - (zst-compressed tsv) that was available at the sequence_as_of. + (zst-compressed tsv) that was current at the date specified in + :any:`sequence_as_of` """ def __init__(self, sequence_as_of=None, tree_as_of=None): - """ - Parameters - ---------- - sequence_as_of : datetime | str | None, default = now() - Use the NextStrain sequences and sequence metadata that were available - as of this date. Can be a datetime object, a string in the format - "YYYY-MM-DD", or None (which defaults to the current date and time). - CladeTime treats all dates and times as UTC. - tree_as_of : datetime | str | None, default = now() - Use the NextStrain reference tree that was available as of this date. - Can be a datetime object, a string in the format - "YYYY-MM-DD", or None (which defaults to the sequence_as_of date). - CladeTime treats all dates and times as UTC. - """ - + """CladeTime constructor.""" self._config = self._get_config() self.sequence_as_of = sequence_as_of self.tree_as_of = tree_as_of @@ -83,11 +77,16 @@ def __init__(self, sequence_as_of=None, tree_as_of=None): @property def sequence_as_of(self) -> datetime: + """ + datetime.datetime : The date and time (UTC) used to retrieve NextStrain sequences + and sequence metadata. :any:`url_sequence` and + :any:`url_sequence_metadata` link to + Nextstrain files that were current as of this date. 
+ """ return self._sequence_as_of @sequence_as_of.setter def sequence_as_of(self, date) -> None: - """Set the sequence_as_of attribute.""" sequence_as_of = self._validate_as_of_date(date) utc_now = datetime.now(timezone.utc) if sequence_as_of > utc_now: @@ -101,11 +100,16 @@ def sequence_as_of(self, date) -> None: @property def tree_as_of(self) -> datetime: + """ + datetime.datetime : The date and time (UTC) used to retrieve the NextStrain + reference tree. :any:`get_reference_tree` + uses this date to get the reference tree that was current as + of this date. + """ return self._tree_as_of @tree_as_of.setter def tree_as_of(self, date) -> None: - """Set the tree_as_of attribute.""" if date is None: tree_as_of = self.sequence_as_of else: @@ -126,7 +130,12 @@ def ncov_metadata(self): @ncov_metadata.getter def ncov_metadata(self) -> dict: - """Get the ncov_metadata attribute.""" + """ + dict : Metadata for the reference tree that was used for SARS-CoV-2 + clade assignments as of :any:`tree_as_of`. + This property will be empty for dates before 2024-08-01, when + Nextstrain began publishing ncov pipeline metadata. 
+ """ if self.url_ncov_metadata: metadata = _get_ncov_metadata(self.url_ncov_metadata) return metadata @@ -140,7 +149,10 @@ def sequence_metadata(self): @sequence_metadata.getter def sequence_metadata(self) -> pl.LazyFrame: - """Get the sequence_metadata attribute.""" + """ + :class:`polars.LazyFrame` : A Polars LazyFrame that references + :any:`url_sequence_metadata` + """ if self.url_sequence_metadata: sequence_metadata = get_covid_genome_metadata(metadata_url=self.url_sequence_metadata) return sequence_metadata @@ -182,3 +194,17 @@ def _validate_as_of_date(self, as_of: str) -> datetime: raise CladeTimeInvalidDateError(f"Date must be after May 1, 2023: {as_of_date}") return as_of_date + + def get_reference_tree(self) -> dict: + """Return a reference tree used for SARS-CoV-2 clade assignments + + Retrieves the reference tree that was current as of + :any:`tree_as_of`. + + This method is not yet implemented. + + Returns + ------- + dict + """ + return {self.tree_as_of: "not implemented"}