Skip to content

Commit

Permalink
Merge pull request #231 from dhellmann/pypi-filter-factory-json
Browse files Browse the repository at this point in the history
use json api to retrieve names from pypi.org
  • Loading branch information
mergify[bot] authored Dec 19, 2024
2 parents 65a9b84 + 2eafb07 commit d41eceb
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 70 deletions.
2 changes: 2 additions & 0 deletions docs/source/history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Bug Fixes

- `#229 <https://github.com/sphinx-contrib/spelling/pull/229>`__ Gracefully
handle if git is not installed
- `#227 <https://github.com/sphinx-contrib/spelling/pull/227>`__ Use pypi.org's
JSON API instead of XML-RPC.

7.7.0
=====
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ classifiers = [

requires-python = ">=3.10"

dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0"]
dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0", "requests>=2.32.3"]

[project.optional-dependencies]
test = ["pytest", "pytest-cov", "coverage!=4.4,>=4.0"]
Expand Down
82 changes: 45 additions & 37 deletions sphinxcontrib/spelling/filters.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
#
# Copyright (c) 2010 Doug Hellmann. All rights reserved.
#
"""Spelling checker extension for Sphinx.
"""
"""Spelling checker extension for Sphinx."""

# TODO - Words with multiple uppercase letters treated as classes and ignored

import builtins
import importlib
import subprocess
import sys
from xmlrpc import client as xmlrpc_client

import requests
from enchant.tokenize import Filter, get_tokenizer, tokenize, unit_tokenize
from sphinx.util import logging

Expand All @@ -22,18 +21,19 @@ class AcronymFilter(Filter):
"""If a word looks like an acronym (all upper case letters),
ignore it.
"""

def _skip(self, word):
return (
word.isupper() or # all caps
word.isupper() # all caps
or
# pluralized acronym ("URLs")
(word[-1].lower() == 's' and word[:-1].isupper())
(word[-1].lower() == "s" and word[:-1].isupper())
)


class list_tokenize(tokenize):

def __init__(self, words):
super().__init__('')
super().__init__("")
self._words = words

def next(self):
Expand All @@ -44,8 +44,8 @@ def next(self):


class ContractionFilter(Filter):
"""Strip common contractions from words.
"""
"""Strip common contractions from words."""

splits = {
"aren't": ["are", "not"],
"can't": ["can", "not"],
Expand Down Expand Up @@ -138,8 +138,7 @@ def _split(self, word):


class IgnoreWordsFilter(Filter):
"""Given a set of words, ignore them all.
"""
"""Given a set of words, ignore them all."""

def __init__(self, tokenizer, word_set):
self.word_set = set(word_set)
Expand All @@ -150,7 +149,6 @@ def _skip(self, word):


class IgnoreWordsFilterFactory:

def __init__(self, words):
self.words = words

Expand All @@ -159,23 +157,31 @@ def __call__(self, tokenizer):


class PyPIFilterFactory(IgnoreWordsFilterFactory):
"""Build an IgnoreWordsFilter for all of the names of packages on PyPI.
"""
"""Build an IgnoreWordsFilter for all of the names of packages on PyPI."""

def __init__(self):
client = xmlrpc_client.ServerProxy('https://pypi.python.org/pypi')
super().__init__(client.list_packages())
r = requests.get(
"https://pypi.org/simple/",
headers={
"user-agent": "sphinxcontrib.spelling",
"accept": "application/vnd.pypi.simple.v1+json",
},
)
names = [i["name"] for i in r.json()["projects"]]
logger.debug("retrieved %d project names from pypi.org", len(names))
super().__init__(names)


class PythonBuiltinsFilter(Filter):
"""Ignore names of built-in Python symbols.
"""
"""Ignore names of built-in Python symbols."""

def _skip(self, word):
return hasattr(builtins, word)


class ImportableModuleFilter(Filter):
"""Ignore names of modules that we could import.
"""
"""Ignore names of modules that we could import."""

def __init__(self, tokenizer):
super().__init__(tokenizer)
self.found_modules = set(sys.builtin_module_names)
Expand All @@ -185,7 +191,7 @@ def __init__(self, tokenizer):
# valid module, which is consistent with the behavior before
# version 7.3.1. See
# https://github.com/sphinx-contrib/spelling/issues/141
self.sought_modules.add('__main__')
self.sought_modules.add("__main__")

def _skip(self, word):
# If the word looks like a python module filename, strip the
Expand All @@ -195,13 +201,13 @@ def _skip(self, word):
# it look like Sphinx is complaining about a commandline
# argument. See
# https://github.com/sphinx-contrib/spelling/issues/142
if word.endswith('.py'):
if word.endswith(".py"):
logger.debug(
'removing .py extension from %r before searching for module',
word)
"removing .py extension from %r before searching for module", word
)
word = word[:-3]

valid_module_name = all(n.isidentifier() for n in word.split('.'))
valid_module_name = all(n.isidentifier() for n in word.split("."))
if not valid_module_name:
return False

Expand All @@ -214,8 +220,7 @@ def _skip(self, word):
# error out of distutils, or something else triggered
# by failing to be able to import a parent package to
# use the metadata to search for a subpackage.
logger.debug('find_spec(%r) failed, invalid module name: %s',
word, err)
logger.debug("find_spec(%r) failed, invalid module name: %s", word, err)
else:
if mod is not None:
self.found_modules.add(word)
Expand All @@ -230,25 +235,28 @@ class ContributorFilter(IgnoreWordsFilter):
tokens that are in the set.
"""

_pretty_format = (
'%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn'
)
_pretty_format = "%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn"

def __init__(self, tokenizer):
contributors = self._get_contributors()
super().__init__(tokenizer, contributors)

def _get_contributors(self):
logger.info('Scanning contributors')
cmd = ['git', 'log', '--quiet', '--no-color',
f'--pretty=format:{self._pretty_format}']
logger.info("Scanning contributors")
cmd = [
"git",
"log",
"--quiet",
"--no-color",
f"--pretty=format:{self._pretty_format}",
]

try:
p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
except (subprocess.CalledProcessError, FileNotFoundError) as err:
logger.warning('Called: %s', ' '.join(cmd))
logger.warning('Failed to scan contributors: %s', err)
logger.warning("Called: %s", " ".join(cmd))
logger.warning("Failed to scan contributors: %s", err)
return set()
output = p.stdout.decode('utf-8')
tokenizer = get_tokenizer('en_US', filters=[])
output = p.stdout.decode("utf-8")
tokenizer = get_tokenizer("en_US", filters=[])
return {word for word, pos in tokenizer(output)}
66 changes: 34 additions & 32 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#
# Copyright (c) 2010 Doug Hellmann. All rights reserved.
#
"""Tests for filters.
"""
"""Tests for filters."""

import contextlib
import logging
Expand All @@ -12,38 +11,38 @@
import pytest
from enchant.tokenize import get_tokenizer

from sphinxcontrib.spelling import filters # isort:skip
from tests import helpers # isort:skip
from sphinxcontrib.spelling import filters # isort:skip
from tests import helpers # isort:skip

# Replace the sphinx logger with a normal one so pytest can collect
# the output.
filters.logger = logging.getLogger('test.filters')
filters.logger = logging.getLogger("test.filters")


def test_builtin_unicode():
f = filters.PythonBuiltinsFilter(None)
assert not f._skip('passé')
assert not f._skip("passé")


def test_builtin_regular():
f = filters.PythonBuiltinsFilter(None)
assert f._skip('print')
assert f._skip("print")


def test_acronym():
text = 'a front-end for DBM-style databases'
t = get_tokenizer('en_US', [])
text = "a front-end for DBM-style databases"
t = get_tokenizer("en_US", [])
f = filters.AcronymFilter(t)
words = [w[0] for w in f(text)]
assert 'DBM' not in words, 'Failed to filter out acronym'
assert "DBM" not in words, "Failed to filter out acronym"


def test_acronym_unicode():
text = 'a front-end for DBM-style databases'
t = get_tokenizer('en_US', [])
text = "a front-end for DBM-style databases"
t = get_tokenizer("en_US", [])
f = filters.AcronymFilter(t)
words = [w[0] for w in f(text)]
assert 'DBM' not in words, 'Failed to filter out acronym'
assert "DBM" not in words, "Failed to filter out acronym"


@helpers.require_git_repo
Expand Down Expand Up @@ -77,7 +76,7 @@ def test_acronym_unicode():
"Timotheus",
"Tobias",
"Tricoli",
]
],
)
def test_contributors(name):
f = filters.ContributorFilter(None)
Expand All @@ -87,11 +86,11 @@ def test_contributors(name):
@pytest.mark.parametrize(
"word,expected",
[
('os', True),
('os.name', False),
('__main__', False),
("os", True),
("os.name", False),
("__main__", False),
("don't", False),
]
],
)
def test_importable_module_skip(word, expected):
f = filters.ImportableModuleFilter(None)
Expand All @@ -110,42 +109,45 @@ def import_path(new_path):


def test_importable_module_with_side_effets(tmpdir):
logging.debug('tmpdir %r', tmpdir)
logging.debug('cwd %r', os.getcwd())
logging.debug("tmpdir %r", tmpdir)
logging.debug("cwd %r", os.getcwd())

parentdir = tmpdir.join('parent')
parentdir = tmpdir.join("parent")
parentdir.mkdir()

parentdir.join('__init__.py').write(
'raise SystemExit("exit as side-effect")\n'
)
parentdir.join('child.py').write('')
parentdir.join("__init__.py").write('raise SystemExit("exit as side-effect")\n')
parentdir.join("child.py").write("")

with import_path([str(tmpdir)] + sys.path):
f = filters.ImportableModuleFilter(None)
skip_parent = f._skip('parent')
skip_both = f._skip('parent.child')
skip_parent = f._skip("parent")
skip_both = f._skip("parent.child")

# The parent module name is valid because it is not imported, only
# discovered.
assert skip_parent is True
assert 'parent' in f.found_modules
assert "parent" in f.found_modules

# The child module name is not valid because the parent is
# imported to find the child and that triggers the side-effect.
assert skip_both is False
assert 'parent.child' not in f.found_modules
assert "parent.child" not in f.found_modules


def test_importable_module_with_system_exit(tmpdir):
path = tmpdir.join('mytestmodule.py')
path = tmpdir.join("mytestmodule.py")
path.write('raise SystemExit("exit as side-effect")\n')

with import_path([str(tmpdir)] + sys.path):
f = filters.ImportableModuleFilter(None)
skip = f._skip('mytestmodule')
skip = f._skip("mytestmodule")

# The filter does not actually import the module in this case, so
# it shows up as a valid word.
assert skip is True
assert 'mytestmodule' in f.found_modules
assert "mytestmodule" in f.found_modules


def test_pypi_filter_factory():
f = filters.PyPIFilterFactory()
assert "sphinxcontrib-spelling" in f.words

0 comments on commit d41eceb

Please sign in to comment.