From 9e64f77aa35867eb97e3c18d83e2f6aab5e45747 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Tue, 12 Sep 2023 21:35:56 +0200
Subject: [PATCH] test: fetch code blocks in docstrings via markdown extension

---
 .github/workflows/documentation.yml |   2 +-
 .github/workflows/release.yml       |   2 +-
 .github/workflows/tests.yml         |   2 +-
 Makefile                            |   4 +-
 contributing.md                     |   4 +-
 pyproject.toml                      |  16 +--
 tests/extract_docs_code.py          | 156 ++++++++++++++++++++++++++++
 tests/test_docs.py                  |  38 ++++---
 8 files changed, 196 insertions(+), 28 deletions(-)
 create mode 100644 tests/extract_docs_code.py

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index ca281dbd5..13108923b 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -13,7 +13,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install '.[docs]'
+        pip install '.[dev]'
     - name: Set up Git
       run: |
         git config user.name ${{ github.actor }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 540fd7fc8..6f8dba5fb 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -72,7 +72,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install '.[docs]'
+        pip install '.[dev]'
     - name: Set up Git
       run: |
         git config user.name ${{ github.actor }}
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 01a2133be..d1f23bc45 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -79,7 +79,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install '.[docs]'
+        pip install '.[dev]'
     - name: Build documentation
       run: |
         mkdocs build --clean
diff --git a/Makefile b/Makefile
index 6c1b824f0..11944d17c 100644
--- a/Makefile
+++ b/Makefile
@@ -14,14 +14,14 @@ create-env: .venv
 
 install : .venv
 	. .venv/bin/activate
-	pip install -r '.[dev,docs,setup]'.txt
+	pip install -r '.[dev,setup]'.txt
 	python scripts/conjugate_verbs.py
 	pip install -e .
 	pre-commit install
 
 documentation: .venv
 	. .venv/bin/activate
-	pip install -e '.[docs]'
+	pip install -e '.[dev]'
 	mkdocs serve
 
 test: .venv
diff --git a/contributing.md b/contributing.md
index 2ca7634f2..aba1d1f0d 100644
--- a/contributing.md
+++ b/contributing.md
@@ -24,7 +24,7 @@ $ python -m venv venv
 $ source venv/bin/activate
 
 # Install the package with common, dev, setup dependencies in editable mode
-$ pip install -e '.[dev,docs,setup]'
+$ pip install -e '.[dev,setup]'
 # And build resources
 $ python scripts/conjugate_verbs.py
 ```
@@ -113,7 +113,7 @@ We use `MkDocs` for EDS-NLP's documentation. You can checkout the changes you ma
 
 ```console
 # Install the requirements
-$ pip install -e '.[docs]'
+$ pip install -e '.[dev]'
 ---> 100%
 color:green Installation successful
 
diff --git a/pyproject.toml b/pyproject.toml
index bdd337357..2ad72e398 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,12 +41,8 @@ dev = [
     "pytest-cov>=3.0.0,<4.0.0",
     "pytest-html>=3.1.1,<4.0.0",
     "torch>=1.0.0",
-]
-setup = [
-    "mlconjug3<3.9.0",
-    "typer"
-]
-docs = [
+
+    # docs
     "mike~=1.1.2",
     "mkdocs-charts-plugin==0.0.8",
     "mkdocs-img2fig-plugin==0.9.3",
@@ -59,6 +55,10 @@ docs = [
     "pybtex~=0.24.0",
     "pathspec>=0.11.1",  # required by vendored mkdocs-autorefs PR
 ]
+setup = [
+    "mlconjug3<3.9.0",
+    "typer"
+]
 
 [project.urls]
 "Source Code" = "https://github.com/aphp/edsnlp"
@@ -151,6 +151,10 @@ where = ["."]
 [project.entry-points."spacy_languages"]
 "eds" = "edsnlp.language:EDSLanguage"
 
+[project.entry-points."mkdocs.plugins"]
+"bibtex" = "docs.scripts.bibtex:BibTexPlugin"
+"autorefs" = "docs.scripts.autorefs.plugin:AutorefsPlugin"
+
 [build-system]
 requires = [
     "setuptools",
diff --git a/tests/extract_docs_code.py b/tests/extract_docs_code.py
new file mode 100644
index 000000000..1faec44ef
--- /dev/null
+++ b/tests/extract_docs_code.py
@@ -0,0 +1,156 @@
+import re
+import shutil
+import tempfile
+from textwrap import dedent
+from typing import Tuple
+
+from markdown.extensions import Extension
+from markdown.extensions.attr_list import get_attrs
+from markdown.extensions.codehilite import parse_hl_lines
+from markdown.extensions.fenced_code import FencedBlockPreprocessor
+from mkdocs.commands.build import build
+from mkdocs.config import load_config
+from mkdocs.config.config_options import Type as MkType
+from mkdocs.config.defaults import MkDocsConfig
+from mkdocs.plugins import BasePlugin
+from mkdocstrings.extension import AutoDocProcessor
+from mkdocstrings.plugin import MkdocstringsPlugin
+
+BRACKET_RE = re.compile(r"\[([^\[]+)\]")
+CITE_RE = re.compile(r"@([\w_:-]+)")
+DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
+INDENT_RE = re.compile(r"\A\t| {4}(.*)")
+# NOTE(review): no pattern in this group is referenced elsewhere in this module.
+CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
+
+
+class PyCodePreprocessor(FencedBlockPreprocessor):
+    """Collect the Python fenced code blocks found in a Markdown document."""
+
+    FENCED_BLOCK_RE = re.compile(
+        dedent(
+            r"""
+            (?P<fence>^[ ]*(?:~{3,}|`{3,}))[ ]*                          # opening fence
+            ((\{(?P<attrs>[^\}\n]*)\})|                              # (optional {attrs} or
+            (\.?(?P<lang>[\w#.+-]*)[ ]*)?                            # optional (.)lang
+            (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
+            \n                                                       # newline (end of opening fence)
+            (?P<code>.*?)(?<=\n)                                     # the code block
+            (?P=fence)[ ]*$                                          # closing fence
+        """  # noqa: E501
+        ),
+        re.MULTILINE | re.DOTALL | re.VERBOSE,
+    )
+
+    def __init__(self, md, code_blocks):
+        super().__init__(md, {})
+        self.code_blocks = code_blocks
+
+    def run(self, lines):
+        """
+        Append every ``python`` fenced block lacking the ``no-check`` class to
+        ``self.code_blocks`` (dedented); return ``lines`` unchanged.
+        """
+        text = "\n".join(lines)
+        while True:
+            # ----  https://github.com/Python-Markdown/markdown/blob/5a2fee/markdown/extensions/fenced_code.py#L84C9-L98  # noqa: E501
+            m = self.FENCED_BLOCK_RE.search(text)
+            if m:
+                lang, id, classes, config = None, "", [], {}
+                if m.group("attrs"):
+                    id, classes, config = self.handle_attrs(get_attrs(m.group("attrs")))
+                    if len(classes):
+                        lang = classes.pop(0)
+                else:
+                    if m.group("lang"):
+                        lang = m.group("lang")
+                    if m.group("hl_lines"):
+                        # Support `hl_lines` outside of `attrs` for
+                        # backward-compatibility
+                        config["hl_lines"] = parse_hl_lines(m.group("hl_lines"))
+                # ----
+                code = m.group("code")
+
+                if lang == "python" and "no-check" not in classes:
+                    self.code_blocks.append(dedent(code))
+            else:
+                break
+            text = text[m.end() :]
+
+        return lines
+
+
+context_citations = None  # NOTE(review): not used anywhere else in this module
+
+
+class PyCodeExtension(Extension):
+    """Markdown extension that feeds Python code fences into ``code_blocks``."""
+
+    def __init__(self, code_blocks):
+        super().__init__()
+        self.code_blocks = code_blocks
+
+    def extendMarkdown(self, md):
+        self.md = md
+        md.registerExtension(self)
+        collector = PyCodePreprocessor(md, self.code_blocks)
+        md.preprocessors.register(collector, "fenced_code", 31)
+        for registered in md.registeredExtensions:
+            if isinstance(registered, AutoDocProcessor):
+                registered._config["mdx"].append(self)
+
+def makeExtension(*args, **kwargs):  # presumably Python-Markdown's factory hook — confirm
+    return PyCodeExtension(*args, **kwargs)
+
+
+class PyCodeExtractorPlugin(BasePlugin):
+    """MkDocs plugin that records, page by page, the Python code found in the docs."""
+
+    config_scheme: Tuple[Tuple[str, MkType]] = ()  # this plugin takes no options
+
+    def __init__(self, global_config):
+        self.global_config = global_config
+        self.page_code_blocks = []  # snippets of the page currently being rendered
+        self.docs_code_blocks = []  # (page url, joined snippets) of finished pages
+
+    def on_config(self, config: MkDocsConfig):
+        # Appended after pymdownx.highlight, because of weird registering deleting
+        # the first extension
+        extensions = config["markdown_extensions"]
+        self.ext = PyCodeExtension(self.page_code_blocks)
+        extensions.append(self.ext)
+        extensions.remove("pymdownx.highlight")
+        extensions.remove("fenced_code")
+
+    def on_pre_build(self, *, config: MkDocsConfig):
+        mkdocstrings_plugin: MkdocstringsPlugin = config.plugins["mkdocstrings"]
+        mkdocstrings_plugin.get_handler("python")
+
+    def on_page_content(self, html, page, config, files):
+        if self.page_code_blocks:
+            self.docs_code_blocks.append((page.url, "\n".join(self.page_code_blocks)))
+        self.page_code_blocks.clear()
+        return html
+
+
+def extract_docs_code():
+    """
+    Build the docs into a throwaway directory and collect their Python snippets.
+
+    Returns the plugin's list of (page url, concatenated code) pairs.
+    """
+    config = load_config()
+    site_dir = tempfile.mkdtemp()
+    try:
+        config["site_dir"] = site_dir
+        # plug the pycode extractor plugin
+        extractor = PyCodeExtractorPlugin(config)
+        config.plugins["pycode_extractor"] = extractor
+        config["plugins"].run_event("startup", command="build", dirty=False)
+        try:
+            build(config)
+        finally:
+            config["plugins"].run_event("shutdown")
+    finally:
+        shutil.rmtree(site_dir, ignore_errors=True)
+    return extractor.docs_code_blocks
diff --git a/tests/test_docs.py b/tests/test_docs.py
index f62bafcb5..6d239aaa6 100644
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@@ -1,23 +1,31 @@
-from itertools import chain
-from pathlib import Path
-
 import pytest
+from extract_docs_code import extract_docs_code
+
+url_to_code = dict(extract_docs_code())
 
-from edsnlp.utils.blocs import check_md_file
 
-# @pytest.fixture(autouse=True, scope="module")
-# def brat_folder():
-#     yield
-#     shutil.rmtree("path/to/brat")
+def printer(code: str) -> None:
+    """
+    Print a code block with numbered lines for easier debugging.

+    Parameters
+    ----------
+    code : str
+        Code block to display.
+    """
+    numbered = (
+        f"{number:03}  {text}" for number, text in enumerate(code.split("\n"), 1)
+    )

-files = chain(
-    Path("./").glob("*.md"),
-    Path("docs").glob("**/*.md"),
-)
+    print("\n".join(numbered))
 
 
 # Note the use of `str`, makes for pretty output
-@pytest.mark.parametrize("path", files, ids=str)
-def test_code_blocks(path):
-    check_md_file(path=path, memory=True)
+@pytest.mark.parametrize("url", sorted(url_to_code), ids=str)
+def test_code_blocks(url):
+    """Execute every Python snippet of one docs page, dumping it on failure."""
+    try:
+        exec(url_to_code[url], {"__MODULE__": "__main__"})
+    except Exception:
+        printer(url_to_code[url])
+        raise