# check-peps.py

#!/usr/bin/env python3

# This file is placed in the public domain or under the
# CC0-1.0-Universal license, whichever is more permissive.

"""check-peps: Check PEPs for common mistakes.

Usage: check-peps [-d | --detailed] <PEP files...>

Only the PEPs specified are checked.
If none are specified, all PEPs are checked.

Use "--detailed" to show the contents of lines where errors were found.
"""

from __future__ import annotations

import datetime as dt
import re
import sys
from pathlib import Path

# The flag is a plain ``False`` constant, so the block below never executes at
# runtime: the names it defines exist only for annotations, which are not
# evaluated because of ``from __future__ import annotations``.
# NOTE(review): presumably static type checkers special-case the name
# ``TYPE_CHECKING`` and still analyse the block -- confirm for the checker in use.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from collections.abc import Iterable, Iterator, KeysView, Sequence
    from typing import TypeAlias

    # (line number, warning message)
    Message: TypeAlias = tuple[int, str]
    # What every checker/validator in this file yields
    MessageIterator: TypeAlias = Iterator[Message]


# Directory containing this script; the PEP sources are expected in its
# "peps" sub-directory. Reported file names are shown relative to ROOT_DIR.
ROOT_DIR = Path(__file__).resolve().parent
PEP_ROOT = ROOT_DIR / "peps"

# Every header a PEP may carry. See PEP 12 for the order.
# The position of a header in this tuple is the sort key used when checking
# that a PEP's headers appear in the right order.
# Note we retain "BDFL-Delegate"
ALL_HEADERS = (
    "PEP",
    "Title",
    "Version",
    "Last-Modified",
    "Author",
    "Sponsor",
    "BDFL-Delegate", "PEP-Delegate",
    "Discussions-To",
    "Status",
    "Type",
    "Topic",
    "Content-Type",
    "Requires",
    "Created",
    "Python-Version",
    "Post-History",
    "Replaces",
    "Superseded-By",
    "Resolution",
)
# Headers that every PEP must contain; a missing one is reported as an error
REQUIRED_HEADERS = frozenset({"PEP", "Title", "Author", "Status", "Type", "Created"})

# Accepted values of the "Status" header (compared case-sensitively).
# See PEP 1 for the full list
ALL_STATUSES = frozenset({
    "Accepted",
    "Active",
    "April Fool!",
    "Deferred",
    "Draft",
    "Final",
    "Provisional",
    "Rejected",
    "Superseded",
    "Withdrawn",
})

# Four-digit PEP numbers (the file stem without its "pep-" prefix) whose files
# are exempt from the direct-link check, i.e. PEPs that are allowed to link
# directly to PEPs (and RFCs, which the same check covers)
SKIP_DIRECT_PEP_LINK_CHECK = frozenset({"0009", "0287", "0676", "0684", "8001"})

DEFAULT_FLAGS = re.ASCII | re.IGNORECASE  # Insensitive latin

# any sequence of letters or '-', followed by a single ':' and a space or end of line
HEADER_PATTERN = re.compile(r"^([a-z\-]+):(?: |$)", DEFAULT_FLAGS)
# any sequence of unicode letters or legal special characters
# (compiled without DEFAULT_FLAGS, so ``\W`` and ``\d`` are Unicode-aware)
NAME_PATTERN = re.compile(r"(?:[^\W\d_]|[ ',\-.])+(?: |$)")
# any sequence of ASCII letters, digits, or legal special characters
EMAIL_LOCAL_PART_PATTERN = re.compile(r"[\w!#$%&'*+\-/=?^{|}~.]+", DEFAULT_FLAGS)

# An optional "topic-name/" followed by a numeric topic ID and, for posts only,
# an optional numeric post ID.
# NOTE(review): neither Discourse pattern is referenced by the code visible in
# this file; _thread() performs a similar (slightly stricter) check by hand on
# the part of the URL after "https://discuss.python.org/t/" -- confirm whether
# these are still needed.
DISCOURSE_THREAD_PATTERN = re.compile(r"([\w\-]+/)?\d+", DEFAULT_FLAGS)
DISCOURSE_POST_PATTERN = re.compile(r"([\w\-]+/)?\d+(/\d+)?", DEFAULT_FLAGS)

# Mailing-list archive paths. _thread() matches them against the part of the
# URL that follows "https://mail.python.org/pipermail/" (Mailman 2) or
# "https://mail.python.org/archives/list/" (Mailman 3).
MAILMAN_2_PATTERN = re.compile(r"[\w\-]+/\d{4}-[a-z]+/\d+\.html", DEFAULT_FLAGS)
MAILMAN_3_THREAD_PATTERN = re.compile(r"[\w\-]+@python\.org/thread/[a-z0-9]+/?", DEFAULT_FLAGS)
MAILMAN_3_MESSAGE_PATTERN = re.compile(r"[\w\-]+@python\.org/message/[a-z0-9]+/?(#[a-z0-9]+)?", DEFAULT_FLAGS)

# When true, _output_error() also prints the offending source line.
# Controlled by the "--detailed" flag
DETAILED_ERRORS = False


def check(filenames: Sequence[str] = (), /) -> int:
    """Check the given PEP files, or every PEP when none are given.

    With no file names, every file matching ``pep-????.rst`` in PEP_ROOT is
    checked. Returns the process exit status: 0 when no error was found,
    otherwise 1 after printing a one-line summary to stderr.
    """
    if filenames:
        paths = (Path(name) for name in filenames)
    else:
        paths = PEP_ROOT.glob("pep-????.rst")

    total = sum(check_file(path) for path in paths)
    if total <= 0:
        return 0

    plural = "" if total == 1 else "s"
    print(f"check-peps failed: {total} error{plural}", file=sys.stderr)
    return 1


def check_file(filename: Path, /) -> int:
    """Check a single PEP file and return the number of errors printed for it.

    A file that does not exist counts as one error ("Could not read PEP!").
    """
    filename = filename.resolve()
    try:
        text = filename.read_text(encoding="utf-8")
    except FileNotFoundError:
        # There is no content to show, so report against line 0 of a single
        # empty placeholder line.
        return _output_error(filename, [""], [(0, "Could not read PEP!")])

    pep_lines = text.splitlines()
    messages = check_peps(filename, pep_lines)
    return _output_error(filename, pep_lines, messages)


def check_peps(filename: Path, lines: Sequence[str], /) -> MessageIterator:
    """Yield every (line number, message) pair for one PEP.

    Header problems are reported first, followed by direct PEP/RFC links found
    on any line. PEPs listed in SKIP_DIRECT_PEP_LINK_CHECK get the header
    checks only.
    """
    yield from check_headers(lines)

    # Whether the PEP is exempt depends only on its file name, so decide once
    # instead of re-testing it for every line of the file.
    if filename.stem.removeprefix("pep-") in SKIP_DIRECT_PEP_LINK_CHECK:
        return

    for line_num, line in enumerate(lines, start=1):
        yield from check_direct_links(line_num, line.lstrip())


def check_headers(lines: Sequence[str], /) -> MessageIterator:
    """Validate the header block at the top of a PEP.

    The header block runs from the first line up to, but not including, the
    first blank line; when there is no blank line it runs to the end of
    *lines*. Yields (line number, message) pairs for:

    * a first line that is not a valid 'PEP:' header,
    * header names that are unknown or appear more than once,
    * required headers that are missing, and headers out of PEP 12 order,
    * anything the per-header validator reports for a header's value, where
      the value includes any following lines up to the next recognised header.
    """
    yield from _validate_pep_number(next(iter(lines), ""))

    found_headers = {}
    # One past the last header line. This default covers a file with no blank
    # line after the headers: the block then extends to the very last line,
    # which must still be included in the last header's value.
    headers_end_line_num = len(lines) + 1
    for line_num, line in enumerate(lines, start=1):
        if line.strip() == "":
            headers_end_line_num = line_num
            break
        match = HEADER_PATTERN.match(line)
        if match is None:
            # Not a "Name:" line (e.g. the continuation of a multi-line value)
            continue
        header = match[1]
        if header not in ALL_HEADERS:
            yield line_num, f"Must not have invalid header: {header}"
        elif header in found_headers:
            yield line_num, f"Must not have duplicate header: {header} "
        else:
            found_headers[header] = line_num

    yield from _validate_required_headers(found_headers.keys())

    # A header's value runs from its own line up to the line before the next
    # recognised header, or up to the end of the header block for the last one.
    start_line_nums = list(found_headers.values())
    end_line_nums = start_line_nums[1:] + [headers_end_line_num]
    for (header, line_num), end_line_num in zip(found_headers.items(), end_line_nums):
        # Line numbers are 1-based, list indices 0-based.
        value_lines = lines[line_num - 1:end_line_num - 1]
        remainder = "\n".join(value_lines).removeprefix(f"{header}:")
        if remainder != "":
            if remainder[0] not in {" ", "\n"}:
                yield line_num, f"Headers must have a space after the colon: {header}"
            remainder = remainder.lstrip()
        yield from _validate_header(header, line_num, remainder)


def _validate_header(header: str, line_num: int, content: str) -> MessageIterator:
    """Run the validator that belongs to *header* on the header's *content*.

    Headers without an entry in the table below (for example 'PEP', 'Version'
    and 'Last-Modified') are not validated here and yield nothing.
    """
    validators = {
        "Title": _validate_title,
        "Author": _validate_author,
        "Sponsor": _validate_sponsor,
        "BDFL-Delegate": _validate_delegate,
        "PEP-Delegate": _validate_delegate,
        "Discussions-To": _validate_discussions_to,
        "Status": _validate_status,
        "Type": _validate_type,
        "Topic": _validate_topic,
        "Content-Type": _validate_content_type,
        "Requires": _validate_pep_references,
        "Replaces": _validate_pep_references,
        "Superseded-By": _validate_pep_references,
        "Created": _validate_created,
        "Python-Version": _validate_python_version,
        "Post-History": _validate_post_history,
        "Resolution": _validate_resolution,
    }
    validator = validators.get(header)
    if validator is not None:
        yield from validator(line_num, content)


def check_direct_links(line_num: int, line: str) -> MessageIterator:
    """Report direct URLs to PEPs or RFCs; the reST roles must be used instead.

    The URL fragments are searched for case-insensitively. A line containing
    both kinds of link yields both messages, PEP first.
    """
    lowered = line.lower()
    pep_fragments = ("dev/peps/pep-", "peps.python.org/pep-")
    rfc_fragments = ("rfc-editor.org/rfc/", "ietf.org/doc/html/rfc")

    if any(fragment in lowered for fragment in pep_fragments):
        yield line_num, "Use the :pep:`NNN` role to refer to PEPs"
    if any(fragment in lowered for fragment in rfc_fragments):
        yield line_num, "Use the :rfc:`NNN` role to refer to RFCs"


def _output_error(filename: Path, lines: Sequence[str], errors: Iterable[Message]) -> int:
    """Print every error for one file and return how many were printed.

    Each error is printed as ``<file>:<line number>:  <message>``, with the
    file shown relative to ROOT_DIR when it lies inside it. When
    DETAILED_ERRORS is set, the offending source line from *lines* is printed
    underneath.
    """
    try:
        shown_name = filename.relative_to(ROOT_DIR)
    except ValueError:
        # The file is not located under ROOT_DIR; show the path unchanged
        # rather than aborting the whole run.
        shown_name = filename

    err_count = 0
    for line_num, msg in errors:
        err_count += 1

        print(f"{shown_name}:{line_num}:  {msg}")
        if not DETAILED_ERRORS:
            continue

        # Line numbers are 1-based. Some messages point at a line that does
        # not exist (line 0 for an unreadable file, line 1 of an empty file),
        # so show an empty line instead of raising IndexError.
        line = lines[line_num - 1] if 1 <= line_num <= len(lines) else ""
        print("     |")
        print(f"{line_num: >4} | '{line}'")
        print("     |")

    return err_count


###########################
#  PEP Header Validators  #
###########################


def _validate_required_headers(found_headers: KeysView[str]) -> MessageIterator:
    """PEPs must have all required headers, in the PEP 12 order

    *found_headers* holds the header names in the order they appear in the
    file. Every message is reported against line 1.
    """
    pep_12_position = ALL_HEADERS.index

    absent = REQUIRED_HEADERS - set(found_headers)
    for name in sorted(absent, key=pep_12_position):
        yield 1, f"Must have required header: {name}"

    expected_order = sorted(found_headers, key=pep_12_position)
    if expected_order != list(found_headers):
        yield 1, "Headers must be in PEP 12 order. Correct order: " + ", ".join(expected_order)


def _validate_pep_number(line: str) -> MessageIterator:
    """The first line must be a 'PEP: <number>' header

    The number itself is checked by _pep_num(). Every message is reported
    against line 1.
    """
    marker = "PEP: "
    if line[:len(marker)] != marker:
        yield 1, "PEP must begin with the 'PEP:' header"
        return

    number_text = line[len(marker):].lstrip()
    yield from _pep_num(1, number_text, "'PEP:' header")


def _validate_title(line_num: int, line: str) -> MessageIterator:
    """'Title' must not be empty and must be at most 79 characters long"""

    if not line:
        yield line_num, "PEP must have a title"
        return
    if len(line) >= 80:
        yield line_num, "PEP title must be less than 80 characters"


def _validate_author(line_num: int, body: str) -> MessageIterator:
    """'Author' must be list of 'Name <email@example.com>, …'

    *body* may span several lines; messages are reported against the line the
    offending entry is on.
    """

    author_lines = body.split("\n")
    last_offset = len(author_lines) - 1
    for offset, line in enumerate(author_lines):
        current_line_num = line_num + offset

        # Continuation lines must line up with the first author:
        #   Author: Alice
        #               Bob
        #           ^^^^
        # len("Author: ") == 8, so nine leading blanks means over-indented.
        if offset >= 1 and line[:9].isspace():
            yield current_line_num, "Author line must not be over-indented"

        # Every line except the last must announce that more authors follow.
        if offset != last_offset and not line.endswith(","):
            yield current_line_num, "Author continuation lines must end with a comma"

        entries = line.removesuffix(",").split(", ")
        for entry in entries:
            yield from _email(current_line_num, entry, "Author")


def _validate_sponsor(line_num: int, line: str) -> MessageIterator:
    """'Sponsor' must be a single 'Name <email@example.com>' entry"""

    sponsor_messages = _email(line_num, line, "Sponsor")
    yield from sponsor_messages


def _validate_delegate(line_num: int, line: str) -> MessageIterator:
    """'Delegate' entries must have format 'Name <email@example.com>'

    An empty value is accepted. A value containing ', ' is treated as a list
    of delegates (as in PEP 451) and each entry is checked separately.
    """

    if not line:
        return

    if ", " in line:
        entries = line.removesuffix(",").split(", ")
    else:
        entries = [line]

    for entry in entries:
        yield from _email(line_num, entry, "Delegate")


def _validate_discussions_to(line_num: int, line: str) -> MessageIterator:
    """'Discussions-To' must be a thread URL or a mailing list address

    ``https://`` values are judged by _thread() alone. Anything else must be
    an address at python.org or googlegroups.com whose list name consists of
    word characters and '-'.
    """

    yield from _thread(line_num, line, "Discussions-To", discussions_to=True)
    if line.startswith("https://"):
        return

    list_domains = ("@python.org", "@googlegroups.com")
    domain = next((d for d in list_domains if line.endswith(d)), None)
    if domain is None:
        yield line_num, "Discussions-To must be a valid thread URL or mailing list"
        return

    list_name = line.removesuffix(domain)
    if re.fullmatch(r"[\w\-]+", list_name) is None:
        yield line_num, "Discussions-To must be a valid mailing list"


def _validate_status(line_num: int, line: str) -> MessageIterator:
    """'Status' must be exactly one of the values in ALL_STATUSES"""

    is_known_status = line in ALL_STATUSES
    if not is_known_status:
        yield line_num, "Status must be a valid PEP status"


def _validate_type(line_num: int, line: str) -> MessageIterator:
    """'Type' must be 'Standards Track', 'Informational' or 'Process'"""

    if line in ("Standards Track", "Informational", "Process"):
        return
    yield line_num, "Type must be a valid PEP type"


def _validate_topic(line_num: int, line: str) -> MessageIterator:
    """'Topic' must be a sorted, duplicate-free list of valid sub-indices

    Topics are separated by ', '. Each topic must be one of 'Governance',
    'Packaging', 'Typing' or 'Release', written in Title Case. A topic that
    is only mis-capitalised is reported as such, not as an unknown sub-index.
    """

    topics = line.split(", ")
    unique_topics = set(topics)
    if len(topics) > len(unique_topics):
        yield line_num, "Topic must not contain duplicates"

    valid_topics = {"Governance", "Packaging", "Typing", "Release"}
    if unique_topics - valid_topics:
        if not all(map(str.istitle, unique_topics)):
            yield line_num, "Topic must be properly capitalised (Title Case)"
        # Compare case-insensitively on both sides: otherwise a correctly
        # spelled topic listed next to a mis-capitalised one (for example
        # "Packaging, typing") would be reported as an unknown sub-index too.
        known_lowercase = {topic.lower() for topic in valid_topics}
        if {topic.lower() for topic in unique_topics} - known_lowercase:
            yield line_num, "Topic must be for a valid sub-index"
    if sorted(topics) != topics:
        yield line_num, "Topic must be sorted lexicographically"


def _validate_content_type(line_num: int, line: str) -> MessageIterator:
    """'Content-Type' must be exactly 'text/x-rst'"""

    if line == "text/x-rst":
        return
    yield line_num, "Content-Type must be 'text/x-rst'"


def _validate_pep_references(line_num: int, line: str) -> MessageIterator:
    """`Requires`/`Replaces`/`Superseded-By` must be 'NNN' PEP IDs

    Several IDs are separated by ', '. One trailing comma is tolerated.
    """

    trimmed = line.removesuffix(",").rstrip()

    # Every comma must be followed by a space; otherwise splitting on ", "
    # would glue neighbouring IDs together.
    comma_total = trimmed.count(",")
    comma_space_total = trimmed.count(", ")
    if comma_total != comma_space_total:
        yield line_num, "PEP references must be separated by comma-spaces (', ')"
        return

    for reference in trimmed.split(", "):
        yield from _pep_num(line_num, reference, "PEP reference")


def _validate_created(line_num: int, line: str) -> MessageIterator:
    """'Created' must be a single 'DD-mmm-YYYY' date"""

    date_messages = _date(line_num, line, "Created")
    yield from date_messages


def _validate_python_version(line_num: int, line: str) -> MessageIterator:
    """'Python-Version' must be a list of ``X.Y[.Z]`` versions

    Versions are separated by ', ' and each one is validated independently.
    The major part must be 1, 2 or 3; the minor part must be a number without
    leading zeros, or 'x'; the optional micro part must be a number without
    leading zeros and is not allowed after an 'x' minor part.
    """

    for version in line.split(", "):
        if version.count(".") not in {1, 2}:
            yield line_num, f"Python-Version must have two or three segments: {version}"
            continue

        # One or two dots were counted above, so there are two or three parts.
        major, minor, *rest = version.split(".")

        # Compare against a set: ``major in "123"`` would be a substring test
        # and would also accept "", "12" and "23".
        if major not in {"1", "2", "3"}:
            yield line_num, f"Python-Version major part must be 1, 2, or 3: {version}"
        if not _is_digits(minor) and minor != "x":
            yield line_num, f"Python-Version minor part must be numeric: {version}"
        elif minor != "0" and minor[0] == "0":
            yield line_num, f"Python-Version minor part must not have leading zeros: {version}"

        if not rest:
            # No micro part: move on to the next version in the list
            # (do not stop validating the remaining versions).
            continue
        micro = rest[0]
        if minor == "x":
            yield line_num, f"Python-Version micro part must be empty if minor part is 'x': {version}"
        elif micro[:1] == "0":
            yield line_num, f"Python-Version micro part must not have leading zeros: {version}"
        elif not _is_digits(micro):
            # Also reached for an empty micro part after a trailing dot ("3.8.")
            yield line_num, f"Python-Version micro part must be numeric: {version}"


def _validate_post_history(line_num: int, body: str) -> MessageIterator:
    """'Post-History' must be '`DD-mmm-YYYY <Thread URL>`__, …'

    *body* may span several lines and entries are separated by ', '. Each
    entry is either a bare date or a reST link whose text is the date and
    whose target is a thread URL. An empty value is accepted. Anything that
    is not a complete link is validated as a date, so a malformed link is
    reported as an invalid date instead of raising an exception.
    """

    if body == "":
        return

    for current_line_num, line in enumerate(body.removesuffix(",").split("\n"), start=line_num):
        for post in line.removesuffix(",").strip().split(", "):
            # Only a complete link has both the opening '`' and the closing '>`__'
            if post.startswith("`") and post.endswith(">`__"):
                post_date, separator, post_url = post[1:-4].partition(" <")
                if separator:
                    yield from _date(current_line_num, post_date, "Post-History")
                    yield from _thread(current_line_num, post_url, "Post-History")
                    continue
            yield from _date(current_line_num, post, "Post-History")


def _validate_resolution(line_num: int, line: str) -> MessageIterator:
    """'Resolution' must be a direct thread URL; message URLs are allowed too"""

    resolution_messages = _thread(line_num, line, "Resolution", allow_message=True)
    yield from resolution_messages


########################
#  Validation Helpers  #
########################

def _pep_num(line_num: int, pep_number: str, prefix: str) -> MessageIterator:
    """A PEP number must be 0-9999, in ASCII digits without leading zeros

    *prefix* names the thing being checked and starts every message.
    """
    if not pep_number:
        yield line_num, f"{prefix} must not be blank: {pep_number!r}"
        return

    # "0" on its own is a valid number, not a leading zero
    if pep_number[0] == "0" and len(pep_number) > 1:
        yield line_num, f"{prefix} must not contain leading zeros: {pep_number!r}"

    if _is_digits(pep_number):
        value = int(pep_number)
        if value < 0 or value > 9999:
            yield line_num, f"{prefix} must be between 0 and 9999: {pep_number!r}"
    else:
        yield line_num, f"{prefix} must be numeric: {pep_number!r}"


def _is_digits(string: str) -> bool:
    """Return True if *string* is a non-empty run of ASCII digits ([0-9]+)."""
    if not string.isascii():
        # str.isdigit() alone would also accept non-ASCII digit characters
        return False
    return string.isdigit()


def _email(line_num: int, author_email: str, prefix: str) -> MessageIterator:
    """An entry must be 'Name' or 'Name <email@example.com>'

    The address part is optional. Inside it, ' at ' is accepted in place of
    '@'. *prefix* names the header being checked and starts every message.
    """
    author_email = author_email.strip()

    for symbol in "<>@":
        if author_email.count(symbol) > 1:
            yield line_num, f"{prefix} entries must not contain multiple '{symbol}': {author_email!r}"

    # The name is everything before the first '<'
    author = author_email.partition("<")[0].rstrip()
    if NAME_PATTERN.fullmatch(author) is None:
        yield line_num, f"{prefix} entries must begin with a valid 'Name': {author_email!r}"
        return

    email_text = author_email.removeprefix(author)
    if email_text == "":
        # Does not have the optional email part
        return

    is_wrapped = email_text.startswith(" <") and email_text.endswith(">")
    if not is_wrapped:
        yield line_num, f"{prefix} entries must be formatted as 'Name <email@example.com>': {author_email!r}"
    address = email_text.removeprefix(" <").removesuffix(">")

    invalid_address = (line_num, f"{prefix} entries must contain a valid email address: {author_email!r}")
    # '@' takes precedence over the obfuscated ' at ' spelling
    for separator in ("@", " at "):
        if separator in address:
            local, domain = address.rsplit(separator, 1)
            break
    else:
        yield invalid_address
        return

    if EMAIL_LOCAL_PART_PATTERN.fullmatch(local) is None or _invalid_domain(domain):
        yield invalid_address


def _invalid_domain(domain_part: str) -> bool:
    """Return True if *domain_part* is not an acceptable email domain.

    The text after the last '.' (or the whole string when there is no '.')
    must be ASCII and alphanumeric. Every label before it must be
    alphanumeric once its '-' characters are removed.
    """
    pieces = domain_part.split(".")
    root = pieces.pop()
    if any(not label.replace("-", "").isalnum() for label in pieces):
        return True
    return not (root.isalnum() and root.isascii())


def _thread(line_num: int, url: str, prefix: str, *, allow_message: bool = False, discussions_to: bool = False) -> MessageIterator:
    """*url* must point at a discussion thread on a recognised host

    Accepted are Discourse topics on discuss.python.org, Mailman 2 messages
    under mail.python.org/pipermail/ and Mailman 3 threads under
    mail.python.org/archives/list/.

    line_num: line number to report against.
    url: the URL to check.
    prefix: name of the header being checked; starts the message.
    allow_message: also accept a Mailman 3 *message* URL.
    discussions_to: the value comes from 'Discussions-To'. A Discourse URL
        must then be the thread itself rather than a post, and a value that
        is not an ``https://`` URL is left to the caller to judge.

    Raises ValueError (when the generator is first advanced) if both
    *allow_message* and *discussions_to* are true.
    """
    if allow_message and discussions_to:
        msg = "allow_message and discussions_to cannot both be True"
        raise ValueError(msg)

    msg = f"{prefix} must be a valid thread URL"

    if not url.startswith("https://"):
        if not discussions_to:
            yield line_num, msg
        return

    discourse_prefix = "https://discuss.python.org/t/"
    if url.startswith(discourse_prefix):
        remainder = url.removeprefix(discourse_prefix).removesuffix("/")
        if discussions_to:
            # Discussions-To links must be the thread itself, not a post
            is_valid = _is_discourse_thread_path(remainder)
        else:
            is_valid = _is_discourse_post_path(remainder)
        if is_valid:
            return

    if url.startswith("https://mail.python.org/pipermail/"):
        remainder = url.removeprefix("https://mail.python.org/pipermail/")
        if MAILMAN_2_PATTERN.fullmatch(remainder) is not None:
            return

    if url.startswith("https://mail.python.org/archives/list/"):
        remainder = url.removeprefix("https://mail.python.org/archives/list/")
        if allow_message and MAILMAN_3_MESSAGE_PATTERN.fullmatch(remainder) is not None:
            return
        if MAILMAN_3_THREAD_PATTERN.fullmatch(remainder) is not None:
            return

    yield line_num, msg


def _is_discourse_topic_name(topic_name: str) -> bool:
    """Return True if *topic_name* is acceptable as a Discourse topic name."""
    # '-' and '_' are allowed in a name; mapping them to a digit lets
    # str.isalnum() accept them. The result must not be entirely numeric.
    normalised = topic_name.replace("-", "0").replace("_", "0")
    return not _is_digits(normalised) and normalised.isalnum()


def _is_discourse_thread_path(remainder: str) -> bool:
    """Return True for '[topic-name/]topic-id', i.e. a thread but not a post."""
    # ``str.rpartition`` is used as the topic name is optional
    topic_name, _, topic_id = remainder.rpartition("/")
    if not _is_digits(topic_id):
        return False
    return topic_name == "" or _is_discourse_topic_name(topic_name)


def _is_discourse_post_path(remainder: str) -> bool:
    """Return True for '[topic-name/]topic-id[/post-id]'."""
    parts = remainder.split("/")
    if len(parts) == 3:
        # Three parts can only be "topic-name/topic-id/post-id".
        topic_name, topic_id, post_id = parts
        return _is_discourse_topic_name(topic_name) and _is_digits(topic_id) and _is_digits(post_id)
    if len(parts) == 2:
        # Two parts are either "topic-name/topic-id" or "topic-id/post-id".
        left, right = parts
        if not _is_digits(right):
            return False
        # The ID test must see the original text: testing the normalised name
        # would let '-' and '_' pass as digits (e.g. "12-3/45").
        return _is_discourse_topic_name(left) or _is_digits(left)
    # A single part must be a topic ID; four or more parts are never valid
    # (the '/' characters make the digit test fail).
    return _is_digits(remainder)


def _date(line_num: int, date_str: str, prefix: str) -> MessageIterator:
    """*date_str* must be a zero-padded 'DD-mmm-YYYY' date in a sane range

    The year must be 1990 or later and the date at most 14 days after the
    current local time. *prefix* names the header being checked and starts
    every message.
    """
    malformed = (line_num, f"{prefix} must be a 'DD-mmm-YYYY' date: {date_str!r}")
    try:
        parsed_date = dt.datetime.strptime(date_str, "%d-%b-%Y")
    except ValueError:
        yield malformed
        return

    # The parser accepts a one-digit day, so a '-' in second position shows
    # that the day was not zero-padded.
    if date_str[1] == "-":
        yield malformed
        return

    latest_allowed = dt.datetime.now() + dt.timedelta(days=14)
    if parsed_date.year < 1990:
        yield line_num, f"{prefix} must not be before Python was invented: {date_str!r}"
    if parsed_date > latest_allowed:
        yield line_num, f"{prefix} must not be in the future: {date_str!r}"


if __name__ == "__main__":
    # Any help flag wins over every other argument.
    if not {"-h", "--help", "-?"}.isdisjoint(sys.argv[1:]):
        print(__doc__, file=sys.stderr)
        raise SystemExit(0)

    # A dict serves as an insertion-ordered set of file names.
    files = {}
    for arg in sys.argv[1:]:
        if arg in {"-d", "--detailed"}:
            DETAILED_ERRORS = True
        elif arg.startswith("-"):
            print(f"Unknown option: {arg!r}", file=sys.stderr)
            raise SystemExit(1)
        else:
            files.setdefault(arg, None)
    raise SystemExit(check(files))