Skip to content

Commit

Permalink
docs: make most of our API clickable
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Nov 8, 2024
1 parent 439fd7c commit c100b50
Show file tree
Hide file tree
Showing 9 changed files with 274 additions and 169 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Allow `converter` argument of `edsnlp.data.read/from_...` to be a list of converters instead of a single converter
- New revamped and documented `edsnlp.train` script and API
- Support YAML config files (supported only CFG/INI files before)
- Most of EDS-NLP functions are now clickable in the documentation

### Changed

Expand Down
5 changes: 5 additions & 0 deletions docs/assets/stylesheets/extra.css
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,8 @@ body, input {
min-width: initial !important;
padding: .5em 0.75em;
}

/* Mute auto-generated API links inside code snippets: inherit the
   surrounding text colour and mark the link with a subtle dashed
   underline instead of the default link styling. */
a.discrete-link {
  color: inherit !important;
  border-bottom: 1px dashed var(--md-default-fg-color--lighter) !important;
}
235 changes: 235 additions & 0 deletions docs/scripts/clickable_snippets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
# Based on https://github.com/darwindarak/mdx_bib
import os
import re
from bisect import bisect_right
from typing import Tuple

import jedi
import mkdocs.structure.pages
import parso
import regex
from mkdocs.config.config_options import Type as MkType
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.plugins import BasePlugin

from docs.scripts.autorefs.plugin import AutorefsPlugin

try:
from importlib.metadata import entry_points
except ImportError:
from importlib_metadata import entry_points


from bs4 import BeautifulSoup

# Citation-markup patterns inherited from the mdx_bib-based implementation
# this module is derived from (see header note).
BRACKET_RE = re.compile(r"\[([^\[]+)\]")
CITE_RE = re.compile(r"@([\w_:-]+)")
DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
INDENT_RE = re.compile(r"\A\t| {4}(.*)")

# Matches the value of an <a href=...> or <img src=...> attribute, whether
# double-quoted, single-quoted or unquoted (one capture group per style).
HREF_REGEX = (
    r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
    r'(?:"([^"]*)"|\'([^\']*)|[ ]*([^ =>]*)(?![a-z]+=))'
)
# Maybe find something less specific ?
# Matches a bare "eds.xxx" pipe name in plain text.
PIPE_REGEX = r"(?<![a-zA-Z0-9._-])eds[.]([a-zA-Z0-9._-]*)(?![a-zA-Z0-9._-])"

# Same as PIPE_REGEX, but for syntax-highlighted HTML where "eds", "." and
# the pipe name end up in three consecutive <span> elements.
HTML_PIPE_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>eds<\/span>
<span[^>]*>[.]<\/span>
<span[^>]*>([a-zA-Z0-9._-]*)<\/span>
(?![a-zA-Z0-9._-])
"""

# One or more comma-separated [@key] citations inside square brackets.
CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"


class ClickableSnippetsPlugin(BasePlugin):
    """
    MkDocs plugin that makes the rendered documentation "clickable":

    - absolute ``href``/``src`` attributes are rewritten relative to the page,
    - ``eds.*`` pipe names (in text and in highlighted HTML) are wrapped with
      links to their API reference entry,
    - identifiers inside multi-line ``<code>`` snippets are resolved with
      jedi and linked to their documentation via the autorefs plugin.
    """

    # No user-facing options yet; commented examples kept for reference.
    config_scheme: Tuple[Tuple[str, MkType], ...] = (
        # ("bibtex_file", MkType(str)),  # type: ignore[assignment]
        # ("order", MkType(str, default="unsorted")),  # type: ignore[assignment]
    )

    @mkdocs.plugins.event_priority(1000)
    def on_config(self, config: MkDocsConfig):
        """
        Swap the stock ``autorefs`` plugin for our patched ``AutorefsPlugin``,
        carrying its configuration over. Runs with maximum priority so the
        replacement happens before any other plugin reacts to the config.
        """
        # Drop every event handler the stock autorefs plugin registered.
        for event_name, events in config.plugins.events.items():
            for event in list(events):  # copy: `events` is mutated while iterating
                if "autorefs" in str(event):
                    events.remove(event)
        # Re-register a fresh patched instance under both registries and
        # re-apply the old instance's configuration to it.
        old_plugin = config["plugins"]["autorefs"]
        plugin_config = dict(old_plugin.config)
        plugin = AutorefsPlugin()
        config.plugins["autorefs"] = plugin
        config["plugins"]["autorefs"] = plugin
        plugin.load_config(plugin_config)

    @classmethod
    def get_ep_namespace(cls, ep, namespace):
        """Return the entry points registered under ``namespace``."""
        # Modern importlib.metadata returns an object exposing .select();
        # the legacy API returns a plain {group: [entry points]} mapping.
        if hasattr(ep, "select"):
            return ep.select(group=namespace)
        else:  # dict
            return ep.get(namespace, [])

    @mkdocs.plugins.event_priority(-1000)
    def on_post_page(
        self,
        output: str,
        page: mkdocs.structure.pages.Page,
        config: mkdocs.config.Config,
    ) -> str:
        """
        1. Replace absolute paths with paths relative to the rendered page.
           This must be performed after all other plugins have run.
        2. Replace component names with links to the component reference.

        Parameters
        ----------
        output: str
            Rendered HTML of the page
        page: mkdocs.structure.pages.Page
            The page being rendered
        config: mkdocs.config.Config
            The MkDocs configuration

        Returns
        -------
        str
            The transformed HTML
        """

        autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
        ep = entry_points()
        # Map "eds.xxx" factory names to their "module:attr" entry point value.
        spacy_factories_entry_points = {
            ep.name: ep.value
            for ep in (
                *self.get_ep_namespace(ep, "spacy_factories"),
                *self.get_ep_namespace(ep, "edsnlp_factories"),
            )
        }

        def replace_component(match):
            # Turn a matched "eds.xxx" factory name into an API link; leave it
            # untouched when the factory is unknown, autorefs has no URL for
            # it, or it appears shortly after "DEFAULT:" (parameter tables).
            full_group = match.group(0)
            name = "eds." + match.group(1)
            ep = spacy_factories_entry_points.get(name)
            preceding = output[match.start(0) - 50 : match.start(0)]
            if ep is not None and "DEFAULT:" not in preceding:
                try:
                    url = autorefs.get_item_url(ep.replace(":", "."))
                except KeyError:
                    pass
                else:
                    return f"<a href={url}>{name}</a>"
            return full_group

        def replace_link(match):
            # Rewrite absolute ("/...") link targets relative to this page.
            relative_url = url = match.group(1) or match.group(2) or match.group(3)
            page_url = os.path.join("/", page.file.url)
            if url.startswith("/"):
                relative_url = os.path.relpath(url, page_url)
            return f'"{relative_url}"'

        output = regex.sub(PIPE_REGEX, replace_component, output)
        output = regex.sub(HTML_PIPE_REGEX, replace_component, output)
        output = regex.sub(HREF_REGEX, replace_link, output)

        # Concatenate every multi-line <code> snippet of the page into one
        # virtual module so jedi can resolve names across snippets, while
        # remembering which HTML text node each character came from.
        all_snippets = ""
        all_offsets = []
        all_nodes = []

        soups = []

        for match in regex.finditer("<code>.*?</code>", output, flags=regex.DOTALL):
            node = match.group(0)
            if "\n" in node:  # only multi-line snippets are treated as code
                soup, snippet, python_offsets, html_nodes = self.convert_html_to_code(
                    node
                )
                size = len(all_snippets)
                all_snippets += snippet + "\n"
                all_offsets.extend([size + i for i in python_offsets])
                all_nodes.extend(html_nodes)
                soups.append((soup, match.start(0), match.end(0)))

        interpreter = jedi.Interpreter(all_snippets, [{}])
        # Cumulative offset of the start of each line in `all_snippets`,
        # used to convert jedi's (line, column) positions to flat offsets.
        line_lengths = [0]
        for line in all_snippets.split("\n"):
            line_lengths.append(len(line) + line_lengths[-1] + 1)
        line_lengths[-1] -= 1

        for name in self.iter_names(interpreter._module_node):
            try:
                line, col = name.start_pos
                offset = line_lengths[line - 1] + col
                # Locate the HTML text node containing this name.
                node_idx = bisect_right(all_offsets, offset) - 1

                node = all_nodes[node_idx]
                goto = (interpreter.goto(line, col, follow_imports=True) or [None])[0]
                if (
                    goto
                    and goto.full_name
                    and goto.full_name.startswith("edsnlp")
                    and goto.type != "module"
                ):
                    url = autorefs.get_item_url(goto.full_name)
                    # Check if node has no link in its upstream ancestors
                    if not node.find_parents("a"):
                        node.replace_with(
                            BeautifulSoup(
                                f'<a class="discrete-link" href="{url}">{node}</a>',
                                "html5lib",
                            )
                        )
            except Exception:
                # Best effort: skip names jedi/autorefs cannot resolve
                # instead of failing the whole documentation build.
                pass

        # Re-insert soups into the output
        # (reversed so earlier match offsets remain valid while splicing).
        for soup, start, end in reversed(soups):
            output = output[:start] + str(soup) + output[end:]

        return output

    @classmethod
    def iter_names(cls, root):
        """Yield every parso ``Name`` node in the tree rooted at ``root``."""
        if isinstance(root, parso.python.tree.Name):
            yield root
        for child in getattr(root, "children", ()):
            yield from cls.iter_names(child)

    @classmethod
    def convert_html_to_code(
        cls, html_content: str
    ) -> Tuple[BeautifulSoup, str, list, list]:
        """
        Flatten a highlighted ``<code>`` element back into plain source text.

        Returns
        -------
        soup
            Parsed soup of the snippet (later mutated in place to add links)
        python_code
            Concatenated text content of the snippet
        code_offsets
            Start offset in ``python_code`` of each HTML text node
        html_nodes
            The HTML text nodes, parallel to ``code_offsets``
        """
        pre_html_content = "<pre>" + html_content + "</pre>"
        soup = BeautifulSoup(pre_html_content, "html5lib")
        code_element = soup.find("code")

        # NOTE(review): line_lengths is computed but never used below —
        # presumably a leftover; confirm before removing.
        line_lengths = [0]
        for line in pre_html_content.split("\n"):
            line_lengths.append(len(line) + line_lengths[-1] + 1)
        line_lengths[-1] -= 1

        python_code = ""
        code_offsets = []
        html_nodes = []
        code_offset = 0

        def extract_text_with_offsets(el):
            # Depth-first walk recording each text node and its flat offset.
            nonlocal python_code, code_offset
            for content in el.contents:
                # Recursively process child elements
                if isinstance(content, str):
                    python_code += content
                    code_offsets.append(code_offset)
                    code_offset += len(content)
                    html_nodes.append(content)
                    continue
                extract_text_with_offsets(content)

        extract_text_with_offsets(code_element)

        return soup, python_code, code_offsets, html_nodes

# print("\nOffset Mapping (Python Index -> HTML Index):")
# for mapping in offset_mapping:
# print(mapping)
108 changes: 0 additions & 108 deletions docs/scripts/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,6 @@
import mkdocs.structure.files
import mkdocs.structure.nav
import mkdocs.structure.pages
import regex
from mkdocs.config.defaults import MkDocsConfig

from docs.scripts.autorefs.plugin import AutorefsPlugin

try:
from importlib.metadata import entry_points
except ImportError:
from importlib_metadata import entry_points


def exclude_file(name):
Expand All @@ -33,21 +24,6 @@ def exclude_file(name):
"""


@mkdocs.plugins.event_priority(1000)
def on_config(config: MkDocsConfig):
    """
    Swap the stock ``autorefs`` plugin for our patched ``AutorefsPlugin``,
    carrying its configuration over.

    Runs with maximum priority so the replacement happens before any other
    plugin reacts to the config. The leftover debug ``print`` that logged
    every removed event handler has been removed: it polluted the build
    output on each run and served no purpose.
    """
    # Drop every event handler the stock autorefs plugin registered.
    for event_name, events in config.plugins.events.items():
        for event in list(events):  # copy: `events` is mutated while iterating
            if "autorefs" in str(event):
                events.remove(event)
    # Re-register a fresh patched instance under both registries and
    # re-apply the old instance's configuration to it.
    old_plugin = config["plugins"]["autorefs"]
    plugin_config = dict(old_plugin.config)
    plugin = AutorefsPlugin()
    config.plugins["autorefs"] = plugin
    config["plugins"]["autorefs"] = plugin
    plugin.load_config(plugin_config)


def on_files(files: mkdocs.structure.files.Files, config: mkdocs.config.Config):
"""
Recursively the navigation of the mkdocs config
Expand Down Expand Up @@ -145,87 +121,3 @@ def on_page_read_source(page, config):
if page.file.src_path in VIRTUAL_FILES:
return VIRTUAL_FILES[page.file.src_path]
return None


# Matches the value of an <a href=...> or <img src=...> attribute, whether
# double-quoted, single-quoted or unquoted (one capture group per style).
HREF_REGEX = (
    r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
    r'(?:"([^"]*)"|\'([^\']*)|[ ]*([^ =>]*)(?![a-z]+=))'
)
# Maybe find something less specific ?
# Matches a bare "eds.xxx" pipe name in plain text.
PIPE_REGEX = r"(?<![a-zA-Z0-9._-])eds[.]([a-zA-Z0-9._-]*)(?![a-zA-Z0-9._-])"

# Same as PIPE_REGEX, but for syntax-highlighted HTML where "eds", "." and
# the pipe name end up in three consecutive <span> elements.
HTML_PIPE_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>eds<\/span>
<span[^>]*>[.]<\/span>
<span[^>]*>([a-zA-Z0-9._-]*)<\/span>
(?![a-zA-Z0-9._-])
"""


def get_ep_namespace(ep, namespace):
    """
    Return the entry points registered under ``namespace``.

    Supports both the modern ``importlib.metadata`` API, where ``ep``
    exposes a ``select(group=...)`` method, and the legacy API, where
    ``ep`` is a plain mapping from group name to a list of entry points.
    """
    if not hasattr(ep, "select"):
        # Legacy dict-style API: missing groups default to an empty list.
        return ep.get(namespace, [])
    return ep.select(group=namespace)


@mkdocs.plugins.event_priority(-1000)
def on_post_page(
    output: str,
    page: mkdocs.structure.pages.Page,
    config: mkdocs.config.Config,
) -> str:
    """
    1. Replace absolute paths with paths relative to the rendered page.
       This must be performed after all other plugins have run.
    2. Replace component names with links to the component reference.

    Parameters
    ----------
    output: str
        Rendered HTML of the page
    page: mkdocs.structure.pages.Page
        The page being rendered
    config: mkdocs.config.Config
        The MkDocs configuration

    Returns
    -------
    str
        The transformed HTML
    """

    autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
    ep = entry_points()
    # Map "eds.xxx" factory names to their "module:attr" entry point value.
    spacy_factories_entry_points = {
        ep.name: ep.value
        for ep in (
            *get_ep_namespace(ep, "spacy_factories"),
            *get_ep_namespace(ep, "edsnlp_factories"),
        )
    }

    def replace_component(match):
        # Turn a matched "eds.xxx" factory name into an API link; leave it
        # untouched when the factory is unknown, autorefs has no URL for it,
        # or it appears shortly after "DEFAULT:" (parameter tables).
        full_group = match.group(0)
        name = "eds." + match.group(1)
        ep = spacy_factories_entry_points.get(name)
        preceding = output[match.start(0) - 50 : match.start(0)]
        if ep is not None and "DEFAULT:" not in preceding:
            try:
                url = autorefs.get_item_url(ep.replace(":", "."))
            except KeyError:
                pass
            else:
                return f"<a href={url}>{name}</a>"
        return full_group

    def replace_link(match):
        # Rewrite absolute ("/...") link targets relative to this page.
        relative_url = url = match.group(1) or match.group(2) or match.group(3)
        page_url = os.path.join("/", page.file.url)
        if url.startswith("/"):
            relative_url = os.path.relpath(url, page_url)
        return f'"{relative_url}"'

    # Replace absolute paths with path relative to the rendered page
    output = regex.sub(PIPE_REGEX, replace_component, output)
    output = regex.sub(HTML_PIPE_REGEX, replace_component, output)
    output = regex.sub(HREF_REGEX, replace_link, output)

    return output
Loading

0 comments on commit c100b50

Please sign in to comment.