Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add readthedocs and sphinx to cladetime #35

Merged
merged 8 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Read the Docs configuration file for Sphinx projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

version: 2

# Build environment: OS image and the Python toolchain used for the build
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/conf.py
  # Treat Sphinx warnings as build failures
  fail_on_warning: true

# Install the project itself with its "docs" extra
python:
  install:
    - method: pip
      path: .
      extra_requirements:
        - docs
Binary file added docs/_static/cladetime_logo_dark_mode.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_static/cladetime_logo_light_mode.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_static/reichlab.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_static/reichlab_favicon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
137 changes: 137 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import os
import sys
from datetime import date

# Configuration file for the Sphinx documentation builder.

# -- Project information

project = "Cladetime"
author = "Reich Lab"
# The copyright year is evaluated each time the documentation is built.
project_copyright = f"{date.today().year}, Reich Lab @ The University of Massachusetts Amherst"

# Add cladetime location to the path, so we can use autodoc to
# generate API documentation from docstrings.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, root_path)

# Sphinx expects `version` to be the short major.minor identifier (|version|)
# and `release` to be the full version string (|release|).
# FIXME: get the version dynamically
version = "0.1"
release = "0.1.0"

# -- General configuration

# Sphinx extensions to activate, in the order they are handed to Sphinx.
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx_copybutton",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx_github_style",
    "sphinxext.opengraph",
    "sphinx.ext.napoleon",
]

# Base documentation URL for every external project we cross-reference.
_INTERSPHINX_BASE_URLS = {
    "python": "https://docs.python.org/3/",
    "sphinx": "https://www.sphinx-doc.org/en/master/",
    "polars": "https://docs.pola.rs/api/python/stable",
}
# Each entry is (base URL, inventory location); the inventory slot is None
# for every project.
intersphinx_mapping = {
    project_name: (base_url, None)
    for project_name, base_url in _INTERSPHINX_BASE_URLS.items()
}
intersphinx_disabled_domains = ["std"]

# Copied these settings from the copybutton's config
# https://github.com/executablebooks/sphinx-copybutton/blob/master/docs/conf.py
# Alternatives of the prompt regex, one per recognised prompt.
_COPYBUTTON_PROMPTS = (
    r">>> ",  # Python prompt
    r"\.\.\. ",  # Python continuation prompt
    r"\$ ",  # shell prompt
    r"In \[\d*\]: ",  # IPython-style input prompt
    r" {2,5}\.\.\.: ",  # indented "...:" continuation prompt
    r" {5,8}: ",  # indented ":" continuation prompt
)
copybutton_prompt_text = "|".join(_COPYBUTTON_PROMPTS)
copybutton_prompt_is_regexp = True
copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre"
copybutton_here_doc_delimiter = "EOT"
copybutton_line_continuation_character = "\\"

# The root toctree document.
root_doc = "index"

templates_path = ["_templates"]

# Test code blocks only when explicitly specified
doctest_test_doctest_blocks = ""

# -- Options for HTML output

# Markup for the "announcement" theme option.
_ANNOUNCEMENT_HTML = """
<a style="text-decoration: none; color: white;"
href="https://github.com/reichlab/cladetime/issues">
Cladetime is a work in progress. Please feel free to file issues on GitHub.
</a>
"""

# source for GitHub footer icon:
# https://pradyunsg.me/furo/customisation/footer/#using-embedded-svgs
_GITHUB_ICON_SVG = """
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"></path>
</svg>
"""

# Project repository; reused by the settings below.
_REPOSITORY_URL = "https://github.com/reichlab/cladetime"

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "furo"
html_title = "Cladetime"
html_favicon = "_static/reichlab_favicon.png"
html_last_updated_fmt = "%Y-%m-%d"

# These folders are copied to the documentation's HTML output
html_static_path = ["_static"]

# Settings for the GitHub link extension
linkcode_url = _REPOSITORY_URL

# Theme-specific options.
html_theme_options = {
    "announcement": _ANNOUNCEMENT_HTML,
    "sidebar_hide_name": True,
    "light_logo": "cladetime_logo_light_mode.png",
    "dark_logo": "cladetime_logo_dark_mode.png",
    "navigation_with_keys": True,
    "source_repository": f"{_REPOSITORY_URL}/",
    "footer_icons": [
        {
            "name": "GitHub",
            "url": _REPOSITORY_URL,
            "html": _GITHUB_ICON_SVG,
            "class": "",
        },
    ],
}

# -- API documentation (autodoc)

# Show typehints as content of the function or method
autodoc_typehints = "description"
# Document members in the order they appear in the source file.
autodoc_member_order = "bysource"

# -- Syntax highlighting

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "friendly"

# -- MyST parser

# Optional MyST syntax extensions to switch on; see
# https://myst-parser.readthedocs.io/en/latest/syntax/optional.html
myst_enable_extensions = [
    # math
    "amsmath",
    "deflist",
    "dollarmath",
    # lists and substitutions
    "fieldlist",
    "substitution",
    "tasklist",
    # fences and attributes
    "colon_fence",
    "attrs_inline",
]

# -- Open Graph metadata

# Public documentation site and the logo used in link previews.
_DOCS_SITE_URL = "https://cladetime.readthedocs.io"
_OGP_LOGO_URL = f"{_DOCS_SITE_URL}/en/latest/_static/cladetime_logo_light_mode.png"

ogp_type = "website"
ogp_title = "cladetime documentation"
ogp_site_url = _DOCS_SITE_URL
ogp_image = _OGP_LOGO_URL
# NOTE(review): sphinxext-opengraph appears to document the social-card
# "image" option as an image path -- confirm that a URL is accepted here.
ogp_social_cards = {
    "image": _OGP_LOGO_URL,
    "line_color": "#5d9c9c",
}

# -- Reference checking

# Warn about all references to unknown targets
nitpicky = True
# py:class targets exempted from the nitpicky check.
_IGNORED_PY_CLASSES = (
    "datetime",
    "polars.LazyFrame",
    "polars.lazyframe.frame.LazyFrame",
)
nitpick_ignore = [("py:class", target) for target in _IGNORED_PY_CLASSES]


# -- Options for EPUB output
epub_show_urls = "footnote"
68 changes: 68 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain.

===============
Cladetime
===============

Cladetime is a Python interface for accessing `Nextstrain <https://nextstrain.org/>`_ SARS-CoV-2 sequence and clade data.

.. toctree::
:titlesonly:
:hidden:

Home <self>
user-guide
reference/index

Installation
------------

Cladetime can be installed with `pip <https://pip.pypa.io/>`_:

.. code-block:: bash

$ pip install git+https://github.com/reichlab/cladetime.git


Usage
-----

The :any:`CladeTime` class provides a lightweight wrapper around historical and current
SARS-CoV-2 GenBank sequence and sequence metadata created by `nextstrain.org's <https://nextstrain.org/>`_
daily workflow pipeline.

.. code-block:: python

>>> import polars as pl
>>> from cladetime import CladeTime

>>> ct = CladeTime()
>>> filtered_sequence_metadata = (
... ct.sequence_metadata.select(["country", "division", "date", "host", "clade_nextstrain"])
... .filter(
... pl.col("country") == "USA",
... pl.col("date").is_not_null(),
... pl.col("host") == "Homo sapiens",
... )
... .cast({"date": pl.Date}, strict=False)
... )

>>> filtered_sequence_metadata.head(5).collect()

shape: (5, 5)
┌─────────┬──────────┬────────────┬──────────────┬──────────────────┐
│ country ┆ division ┆ date ┆ host ┆ clade_nextstrain │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str ┆ str │
╞═════════╪══════════╪════════════╪══════════════╪══════════════════╡
│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A │
│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B │
└─────────┴──────────┴────────────┴──────────────┴──────────────────┘

See the :doc:`user-guide` for more details about working with Cladetime.

The :doc:`reference/index` documentation provides API-level documentation.

8 changes: 8 additions & 0 deletions docs/reference/cladetime.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
:og:description: Cladetime is a Python interface for accessing SARS-CoV-2 sequence and clade data provided by Nextstrain.

==========
CladeTime
==========

.. autoclass:: cladetime.CladeTime
:members:
6 changes: 6 additions & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
API Reference
=============

.. toctree::

cladetime
94 changes: 94 additions & 0 deletions docs/user-guide.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
===============
User Guide
===============



Finding Nextstrain SARS-CoV-2 sequences and sequence metadata
--------------------------------------------------------------

Cladetime's CladeTime class provides a lightweight interface to nextstrain.org files.

.. code-block:: python

>>> from cladetime import CladeTime

# Instantiating a CladeTime object with no parameters will use the
# latest available data from nextstrain.org.
>>> ct = CladeTime()

# URL to the most recent SARS-CoV-2 sequence file (.fasta)
>>> ct.url_sequence
'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/sequences.fasta.zst?versionId=d66Hn1T0eFMAg8osEh8Yrod.QEUBRxvu'

# URL to the metadata that describes the sequences in the above file
>>> ct.url_sequence_metadata
'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JTXXFlKyyvt9AerxKMwoZflhFYQFrDek'

# Metadata about the nextstrain data pipeline that generated the sequence file and its metadata
>>> ct.ncov_metadata
{'schema_version': 'v1',
'nextclade_version': 'nextclade 3.8.2',
'nextclade_dataset_name': 'SARS-CoV-2',
'nextclade_dataset_version': '2024-09-25--21-50-30Z',
'nextclade_tsv_sha256sum': '5b0f2b64bfe694a3c96bd5a116de8fae23b144bfd3d22da774d4bfe9a84399c3',
'metadata_tsv_sha256sum': '1dc6a4204039e5c69eed84583faf75bbec1629e531dc99aab5bd566d3fb28295'}


Working with SARS-CoV-2 sequence metadata
------------------------------------------

The CladeTime class also provides a Polars LazyFrame object that points to Nextstrain's sequence metadata file.
This file is in .tsv format and contains information about the sequences, such as their collection date,
host, and location.

The metadata also includes a clade assignment for each sequence. Nextstrain assigns clades based on a reference tree,
and the reference tree varies over time.

.. code-block:: python

>>> import polars as pl
>>> from cladetime import CladeTime

>>> ct = CladeTime()

# ct contains a Polars LazyFrame that references the sequence metadata .tsv file on AWS S3
>>> lf = ct.sequence_metadata
>>> lf
<LazyFrame at 0x105341190>


Getting historical SARS-CoV-2 sequence metadata
------------------------------------------------

A CladeTime instance created without parameters will reference the most
recent data available from Nextstrain.

To access sequence metadata at a specific point in time, pass a date string
in the format 'YYYY-MM-DD' to the CladeTime constructor. Alternatively, you can pass
a Python datetime object. Both will be treated as UTC dates/times. If a date string
is specified, the datetime will be set to 00:00:00 hours:minutes:seconds on that
date, meaning that the CladeTime object will retrieve the sequence metadata that
was available at the start of that day.

.. code-block:: python

>>> import polars as pl
>>> from cladetime import CladeTime

>>> ct = CladeTime(sequence_as_of="2024-08-02")

# ct operations now reference the version of the sequence metadata
# that was available at midnight UTC on August 2, 2024.
>>> ct.sequence_metadata \
... .cast({"date": pl.Date}, strict=False) \
... .select(pl.max("date")).collect()

shape: (1, 1)
┌────────────┐
│ date │
│ --- │
│ date │
╞════════════╡
│ 2024-07-23 │
└────────────┘

17 changes: 16 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ authors = [

requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}

classifiers = [
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: MIT License",
]

dependencies = [
"awscli>=1.32.92",
Expand Down Expand Up @@ -41,9 +45,20 @@ dev = [
"types-python-dateutil",
"types-requests",
]
docs = [
"furo",
"matplotlib",
"myst-parser",
"sphinx>=5.0,<6.0",
"sphinx-copybutton",
"sphinx-github-style",
"sphinxext-opengraph",
]

[project.urls]
Repository = "https://github.com/reichlab/cladetime.git"
Documentation = "https://cladetime.readthedocs.io/"
Issues = "https://github.com/reichlab/cladetime/issues"

[project.entry-points."console_scripts"]
assign_clades = "cladetime.assign_clades:main"
Expand Down
Loading