Skip to content

Commit

Permalink
check embedded code blocks (#1093)
Browse files Browse the repository at this point in the history
* first version of embedded snippets check

* add missing code block types where needed

* small change to snippets script

* fix all parser problems in code blocks

* add better error messages and add check to ci

* add linting of embedded snippets

* small improvement for snippets linting

* remove one ignored error code

* add ruff dep

* add mypy (comment out for now)

* fix bug in script

* ignore lint setup for embedded snippets

* fix linting and small mypy adjustments

* switches from shell to sh as shell block type

* make snippet checker code nicer

* small script changes and readme

* add lint and type check count
  • Loading branch information
sh-rp authored Mar 18, 2024
1 parent 5e0b8b4 commit ce701b5
Show file tree
Hide file tree
Showing 112 changed files with 5,660 additions and 4,995 deletions.
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@ format:
# poetry run isort ./

test-and-lint-snippets:
poetry run mypy --config-file mypy.ini docs/website docs/examples
poetry run flake8 --max-line-length=200 docs/website docs/examples
cd docs/tools && poetry run python check_embedded_snippets.py full
poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools --exclude docs/tools/lint_setup
poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools
cd docs/website/docs && poetry run pytest --ignore=node_modules

lint-security:
Expand Down
37 changes: 37 additions & 0 deletions docs/tools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# DLT docs tools

## `check_embedded_snippets.py`
This script finds all embedded snippets in our docs, extracts them, and performs the following checks:

* Snippet must have a valid language set, e.g. ```py
* Snippet must be parseable (works for py, toml, yaml and json snippets)
* Snippet must pass linting (works for py)
* Coming soon: snippet must pass type checking

This script is run on CI to ensure code quality in our docs.

### Usage

```sh
# Run a full check on all snippets
python check_embedded_snippets.py full

# Show all available commands and arguments for this script
python check_embedded_snippets.py --help

# Only run the linting stage
python check_embedded_snippets.py lint

# Run all stages but only for snippets in files that have the string "walkthrough" in the filepath
# you will probably be using this a lot when working on one doc page
python check_embedded_snippets.py full -f walkthrough

# Run the parsing stage, but only on snippets 49, 345 and 789
python check_embedded_snippets.py parse -s 49,345,789

# run all checks but with a bit more output to the terminal
python check_embedded_snippets.py full -v
```

### Snippet numbers
Each snippet is assigned an index in the order it is encountered. This is useful when writing new snippets in the docs, because it lets you run the checks on just a few selected snippets. Note that these numbers change whenever snippets are inserted into the docs.
Empty file added docs/tools/__init__.py
Empty file.
332 changes: 332 additions & 0 deletions docs/tools/check_embedded_snippets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,332 @@
"""
Walks through all markdown files, finds all code snippets, and checks whether they are parseable.
"""
from typing import List, Dict, Optional, Union

import os, ast, json, yaml, tomlkit, subprocess, argparse  # noqa: I251
from dataclasses import dataclass
from textwrap import dedent

import dlt.cli.echo as fmt

# Root folder that is searched for markdown files. The path is relative, so the script
# has to be started from the docs/tools folder (the Makefile target does a `cd` there).
DOCS_DIR = "../website/docs"

# A line whose stripped content starts with this marker opens or closes a code block
SNIPPET_MARKER = "```"
# Language tags a code block may carry; any other tag fails the language check
ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"]

# The template is written in front of each snippet; the combined result is stored in
# LINT_FILE, which is overwritten for every snippet and excluded from git.
LINT_TEMPLATE = "./lint_setup/template.py"
LINT_FILE = "./lint_setup/lint_me.py"

# The type checking stage only runs when this flag is True
ENABLE_MYPY = False


@dataclass
class Snippet:
    """
    One fenced code block found in a markdown file
    """

    # running number of the snippet, assigned in the order of discovery
    index: int
    # text following the opening marker, "unknown" if there is none
    language: str
    # content between the opening and the closing marker
    code: str
    # path of the markdown file the snippet was found in
    file: str
    # line number of the opening marker inside that file
    line: int

    def __str__(self) -> str:
        """
        Returns a one line description with number, file, line and language of the snippet
        """
        location = f"Snippet No. {self.index} in {self.file} at line {self.line}"
        return f"{location} with language {self.language}"


def collect_markdown_files(verbose: bool) -> List[str]:
    """
    Discovers all docs markdown files

    Walks DOCS_DIR recursively and collects every file ending in ".md". Folders whose
    path contains "api_reference" or "jaffle_shop" are skipped. Directories and files
    are visited in sorted order, so the result is the same on every machine.

    Args:
        verbose: If True, every discovered file is echoed to the terminal.

    Returns:
        The paths of all discovered markdown files.

    Exits with code 1 if fewer than 50 files were found.
    """
    markdown_files: List[str] = []
    for path, dirs, files in os.walk(DOCS_DIR):
        # os.walk yields entries in arbitrary filesystem order. Sorting the directory list
        # in place (which steers the traversal) and the file list keeps the discovery order,
        # and with it the snippet numbers derived from it, stable across machines.
        dirs.sort()
        if "api_reference" in path:
            continue
        if "jaffle_shop" in path:
            continue
        for file in sorted(files):
            if file.endswith(".md"):
                markdown_files.append(os.path.join(path, file))
                if verbose:
                    fmt.echo(f"Discovered {os.path.join(path, file)}")

    if len(markdown_files) < 50:  # sanity check
        fmt.error("Found too few files. Something went wrong.")
        exit(1)

    fmt.note(f"Discovered {len(markdown_files)} markdown files")

    return markdown_files


def collect_snippets(markdown_files: List[str], verbose: bool) -> List[Snippet]:
    """
    Extract all snippets from markdown files

    Every line whose stripped content starts with SNIPPET_MARKER toggles between
    "inside a snippet" and "outside a snippet". The text after the opening marker
    becomes the snippet language ("unknown" if empty). The collected code is dedented
    once the closing marker is reached.

    Args:
        markdown_files: Paths of the markdown files to scan.
        verbose: If True, the number of snippets per allowed language is echoed.

    Returns:
        All snippets in order of discovery, numbered starting at 1.

    Exits with code 1 if a file ends inside an unclosed snippet or if fewer than
    100 snippets were found in total.
    """
    snippets: List[Snippet] = []
    index = 0
    for file in markdown_files:
        current_snippet: Optional[Snippet] = None
        # go line by line and find all code blocks
        with open(file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                if line.strip().startswith(SNIPPET_MARKER):
                    if current_snippet is not None:
                        # closing marker: finish the running snippet
                        current_snippet.code = dedent(current_snippet.code)
                        snippets.append(current_snippet)
                        current_snippet = None
                    else:
                        # opening marker: start a new snippet
                        index += 1
                        current_snippet = Snippet(
                            index=index,
                            language=line.strip().split(SNIPPET_MARKER)[1] or "unknown",
                            code="",
                            file=file,
                            line=line_number,
                        )
                elif current_snippet is not None:
                    current_snippet.code += line

        # An explicit check instead of an assert, because asserts are removed when python
        # runs with -O and the broken file would then go unnoticed.
        if current_snippet is not None:
            fmt.error(
                "It seems that the last snippet in the file was not closed. Please check the file "
                + file
            )
            exit(1)

    fmt.note(f"Discovered {len(snippets)} snippets")
    if verbose:
        for lang in ALLOWED_LANGUAGES:
            lang_count = len([s for s in snippets if s.language == lang])
            fmt.echo(f"Found {lang_count} snippets marked as {lang}")
    if len(snippets) < 100:  # sanity check
        fmt.error("Found too few snippets. Something went wrong.")
        exit(1)
    return snippets


def filter_snippets(
    snippets: List[Snippet],
    files: Optional[str],
    snippet_numbers: Optional[Union[str, List[str]]],
) -> List[Snippet]:
    """
    Filter out snippets based on file or snippet number

    Args:
        snippets: The snippets to filter.
        files: If set, only snippets whose file path contains this string are kept.
        snippet_numbers: If set, only snippets with one of these numbers are kept.
            Accepts a list of number strings or a single comma separated string,
            whitespace around the numbers is ignored.

    Returns:
        The snippets that passed both filters, in their original order.

    Exits with code 1 if no snippet is left after filtering.
    """
    fmt.secho(fmt.bold("Filtering Snippets"))

    # Normalize the numbers to a set of exact strings. Testing membership on a raw
    # comma separated string would be a substring match ("4" is part of "49,345").
    if isinstance(snippet_numbers, str):
        number_list = snippet_numbers.split(",")
    else:
        number_list = snippet_numbers or []
    wanted_numbers = {number.strip() for number in number_list}

    filtered_snippets: List[Snippet] = []
    filtered_count = 0
    for snippet in snippets:
        if files and (files not in snippet.file):
            filtered_count += 1
            continue
        if wanted_numbers and (str(snippet.index) not in wanted_numbers):
            filtered_count += 1
            continue
        filtered_snippets.append(snippet)

    if filtered_count:
        fmt.note(
            f"{filtered_count} Snippets skipped based on file and snippet number settings."
            f" {len(filtered_snippets)} snippets remaining."
        )
    else:
        fmt.note("0 Snippets skipped based on file and snippet number settings")

    if not filtered_snippets:  # sanity check
        fmt.error("No snippets remaining after filter, nothing to do.")
        exit(1)
    return filtered_snippets


def check_language(snippets: List[Snippet]) -> None:
    """
    Check if the language is allowed

    Emits a warning for every snippet whose language is not in ALLOWED_LANGUAGES.

    Args:
        snippets: The snippets to check.

    Exits with code 1 if at least one snippet has an invalid language.
    """
    fmt.secho(fmt.bold("Checking snippets language settings"))
    failed_count = 0
    for snippet in snippets:
        if snippet.language not in ALLOWED_LANGUAGES:
            fmt.warning(f"{str(snippet)} has an invalid language {snippet.language} setting.")
            failed_count += 1

    if failed_count:
        # the message ends without a trailing newline, fmt.error is left to terminate the line
        fmt.error(
            f"Found {failed_count} snippets with invalid language settings.\n"
            f"* Please choose the correct language for your snippets: {ALLOWED_LANGUAGES}\n"
            "* All sh commands, except for windows (bat), should be marked as sh.\n"
            "* All code blocks that are not a specific (markup-) language should be marked as"
            " text."
        )
        exit(1)
    else:
        fmt.note("All snippets have valid language settings")


def clear() -> None:
    """
    Overwrites the current terminal line with 200 spaces and returns to its start
    """
    blank_line = "\r" + " " * 200 + "\r"
    # no newline, so the next progress message is written onto the same line
    fmt.echo(blank_line, nl=False)


def parse_snippets(snippets: List[Snippet], verbose: bool) -> None:
    """
    Run every snippet through the parser library matching its language

    py, toml, json and yaml snippets are parsed, text, sh, bat and sql snippets are
    accepted as they are, any other language counts as a failure. A warning and the
    parser exception are printed for every failing snippet.

    Args:
        snippets: The snippets to parse.
        verbose: Not used by this stage.

    Exits with code 1 if at least one snippet could not be parsed.
    """

    def parse_one(snippet: Snippet) -> None:
        # raises whatever the respective parser raises on invalid content
        language, code = snippet.language, snippet.code
        # there is no parser for text and sh scripts
        if language in ["text", "sh", "bat", "sql"]:
            return
        if language == "py":
            ast.parse(code)
            return
        if language == "toml":
            tomlkit.loads(code)
            return
        if language == "json":
            json.loads(code)
            return
        if language == "yaml":
            yaml.safe_load(code)
            return
        raise ValueError(f"Unknown language {language}")

    fmt.secho(fmt.bold("Parsing snippets"))
    failures = 0
    for snippet in snippets:
        clear()
        fmt.echo(f"\rParsing {snippet}", nl=False)
        try:
            parse_one(snippet)
        except Exception as exc:
            # each parser has its own exception types, so everything is treated as a failure
            clear()
            fmt.warning(f"Failed to parse {str(snippet)}")
            fmt.echo(exc)
            failures += 1

    clear()
    if not failures:
        fmt.note("All snippets could be parsed")
        return
    fmt.error(f"Failed to parse {failures} snippets")
    exit(1)


def prepare_for_linting(snippet: Snippet) -> None:
    """
    Write the lint file: template header, a start marker comment and the snippet code

    The previous content of LINT_FILE is replaced.
    """
    with open(LINT_TEMPLATE, "r", encoding="utf-8") as template_file:
        header = template_file.read()

    content = header + "# Snippet start\n\n" + snippet.code
    with open(LINT_FILE, "w", encoding="utf-8") as lint_file:
        lint_file.write(content)


def lint_snippets(snippets: List[Snippet], verbose: bool) -> None:
    """
    Lint all python snippets with ruff

    Each snippet is written to LINT_FILE (prefixed with the lint template) and checked
    with "ruff check". A snippet fails if ruff exits with a non-zero code.

    Args:
        snippets: The python snippets to lint.
        verbose: Not used by this stage.

    Exits with code 1 if at least one snippet failed.
    """
    fmt.secho(fmt.bold("Linting Python snippets"))
    failed_count = 0
    for count, snippet in enumerate(snippets, start=1):
        clear()
        fmt.echo(f"\rLinting {snippet} ({count} of {len(snippets)})", nl=False)
        prepare_for_linting(snippet)
        result = subprocess.run(["ruff", "check", LINT_FILE], capture_output=True, text=True)
        # Use the exit code instead of searching the output for "error": ruff exits with 1
        # on violations and 2 on abnormal termination. In the latter case stdout may be
        # empty and the snippet would wrongly count as clean.
        if result.returncode != 0:
            failed_count += 1
            clear()
            fmt.warning(f"Failed to lint {str(snippet)}")
            # violations are reported on stdout, stderr is shown if there is no stdout
            fmt.echo(result.stdout.strip() or result.stderr.strip())

    clear()
    if failed_count:
        fmt.error(f"Failed to lint {failed_count} snippets")
        exit(1)
    else:
        fmt.note("All snippets could be linted")


def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None:
    """
    TODO: Run mypy on every python snippet

    Each snippet is written to LINT_FILE (prefixed with the lint template) and handed
    to mypy. A snippet passes only if the mypy output contains "no issues found".

    Args:
        snippets: The python snippets to type check.
        verbose: Not used by this stage.

    Exits with code 1 if at least one snippet failed.
    """
    fmt.secho(fmt.bold("Type checking Python snippets"))
    total = len(snippets)
    failures = 0
    for position, snippet in enumerate(snippets, start=1):
        clear()
        fmt.echo(f"\rType checking {snippet} ({position} of {total})", nl=False)
        prepare_for_linting(snippet)
        mypy_run = subprocess.run(["mypy", LINT_FILE], capture_output=True, text=True)
        if "no issues found" in mypy_run.stdout.lower():
            continue
        failures += 1
        clear()
        fmt.warning(f"Failed to type check {str(snippet)}")
        fmt.echo(mypy_run.stdout.strip())

    clear()
    if not failures:
        fmt.note("All snippets passed type checking")
        return
    fmt.error(f"Failed to type check {failures} snippets")
    exit(1)


# Script entry point: discover all snippets, check their language tags, apply the
# file / snippet number filters and run the stages selected by the command argument.
if __name__ == "__main__":
    fmt.note(
        "Welcome to Snippet Checker 3000, run 'python check_embedded_snippets.py --help' for help."
    )

    # setup cli
    parser = argparse.ArgumentParser(
        description=(
            "Check embedded snippets. Discover, parse, lint, and type check all code snippets in"
            " the docs."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "command",
        help=(
            'Which checks to run. "full" will run all checks, parse, lint or typecheck will only'
            " run that specific step"
        ),
        choices=["full", "parse", "lint", "typecheck"],
        # a positional argument is only optional with nargs="?", without it the default
        # below can never take effect
        nargs="?",
        default="full",
    )
    parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_true")
    parser.add_argument(
        "-f",
        "--files",
        help="Filter .md files to files containing this string in filename",
        type=str,
    )
    parser.add_argument(
        "-s",
        "--snippetnumbers",
        help=(
            "Filter checked snippets to snippetnumbers contained in this string, example:"
            ' "13,412,345"'
        ),
        type=lambda i: i.split(","),
        default=None,
    )

    args = parser.parse_args()

    fmt.secho(fmt.bold("Discovering snippets"))

    # find all markdown files and collect all snippets
    markdown_files = collect_markdown_files(args.verbose)
    snippets = collect_snippets(markdown_files, args.verbose)

    # check language settings, this always runs on all snippets, before any filtering
    check_language(snippets)

    # filter snippets
    filtered_snippets = filter_snippets(snippets, args.files, args.snippetnumbers)

    if args.command in ["parse", "full"]:
        parse_snippets(filtered_snippets, args.verbose)

    # these stages are python only
    python_snippets = [s for s in filtered_snippets if s.language == "py"]
    if args.command in ["lint", "full"]:
        lint_snippets(python_snippets, args.verbose)
    if args.command in ["typecheck", "full"]:
        if ENABLE_MYPY:
            typecheck_snippets(python_snippets, args.verbose)
        elif args.command == "typecheck":
            # do not let an explicitly requested stage pass as a silent no-op
            fmt.warning("Type checking is disabled (ENABLE_MYPY is False), nothing was checked.")
1 change: 1 addition & 0 deletions docs/tools/lint_setup/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lint_me.py
Empty file.
Loading

0 comments on commit ce701b5

Please sign in to comment.