# Commit of the mongodb/docs repository used as the sample corpus.
DOCS_COMMIT=1c6dfe71fd45fbdcdf5c7b73f050f615f4279064
# NOTE(review): the corpus is cloned into .docs; confirm that directory is
# covered by .gitignore.
performance-report: .venv/.EXISTS ## Fetch a sample corpus, and generate a timing report for each part of the parse
	if [ ! -d .docs ]; then git clone https://github.com/mongodb/docs.git .docs; fi
	# Use "&&" so a failed cd can never let "git reset --hard" run against
	# the enclosing checkout; quote the substitution so empty output from
	# git rev-parse does not break the test expression.
	cd .docs && if [ "`git rev-parse HEAD`" != "${DOCS_COMMIT}" ]; then git fetch && git reset --hard "${DOCS_COMMIT}"; fi
	. .venv/bin/activate && python3 -m snooty.performance_report .docs
"""Build a project several times and print the timings collected per phase.

Usage: python3 -m snooty.performance_report <project-root>
"""
import sys
import logging
from pathlib import Path
from typing import Dict, List
from .parser import Project
from .types import Diagnostic, Page, FileId, SerializableType, BuildIdentifierSet
from .util import PerformanceLogger

logging.basicConfig(level=logging.INFO)


class Backend:
    """No-op build backend: every callback ignores its arguments.

    It exists so a project can be built without any output being produced.
    """

    def on_progress(self, progress: int, total: int, message: str) -> None:
        """Ignore a progress notification."""
        pass

    def on_diagnostics(self, path: FileId, diagnostics: List[Diagnostic]) -> None:
        """Ignore the diagnostics reported for *path*."""
        pass

    def on_update(
        self,
        prefix: List[str],
        build_identifiers: BuildIdentifierSet,
        page_id: FileId,
        page: Page,
    ) -> None:
        """Ignore an updated page."""
        pass

    def on_update_metadata(
        self,
        prefix: List[str],
        build_identifiers: BuildIdentifierSet,
        field: Dict[str, SerializableType],
    ) -> None:
        """Ignore updated project metadata."""
        pass

    def on_delete(self, page_id: FileId, build_identifiers: BuildIdentifierSet) -> None:
        """Ignore a deleted page."""
        pass


def main() -> None:
    """Build the project rooted at ``sys.argv[1]`` and print phase timings.

    The project is built ``n_runs`` times; afterwards one line is printed for
    each entry returned by ``PerformanceLogger.singleton().times()``.

    Exits with a usage message when no project root is given.
    """
    # Fail with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: python3 -m snooty.performance_report <project-root>")

    backend = Backend()
    root_path = Path(sys.argv[1])
    project = Project(root_path, backend, {})

    n_runs = 3
    for i in range(n_runs):
        print(f"run {i+1}/{n_runs}")
        # The argument is build()'s max_workers; presumably pinned to one
        # worker to keep timings comparable between runs -- TODO confirm.
        project.build(1)

    # "elapsed" rather than "time" so the stdlib module name is not shadowed.
    for name, elapsed in PerformanceLogger.singleton().times().items():
        print(f"{name}:{elapsed:10.4}")


if __name__ == "__main__":
    main()
class PerformanceLogger:
    """Collect wall-clock timings for named sections of code.

    Every use of :meth:`start` records one elapsed-time sample, measured with
    ``time.perf_counter`` (seconds), under the given name. :meth:`times`
    reports the smallest sample recorded for each name.
    """

    # Shared instance handed out by singleton(); created on first use.
    _singleton: Optional["PerformanceLogger"] = None

    def __init__(self) -> None:
        # Section name -> every duration recorded for it, in seconds.
        self._times: Dict[str, List[float]] = defaultdict(list)

    @contextmanager
    def start(self, name: str) -> Iterator[None]:
        """Time the enclosed ``with`` block and record the result under *name*.

        The sample is recorded even when the block raises.
        """
        start_time = time.perf_counter()
        try:
            yield None
        finally:
            self._times[name].append(time.perf_counter() - start_time)

    def times(self) -> Dict[str, float]:
        """Return the shortest recorded duration, in seconds, for each name.

        Names without any samples are omitted.
        """
        # _times is a defaultdict, so merely reading a missing key leaves an
        # empty list behind; min() of an empty list would raise ValueError.
        return {
            name: min(samples) for name, samples in self._times.items() if samples
        }

    @classmethod
    def singleton(cls) -> "PerformanceLogger":
        """Return the shared instance, creating it if it does not exist yet."""
        # Create lazily instead of asserting: assert statements are removed
        # under ``python -O``, and this keeps the accessor self-contained.
        if cls._singleton is None:
            cls._singleton = cls()
        return cls._singleton


# Create the shared instance eagerly at import time.
PerformanceLogger._singleton = PerformanceLogger()