Merge pull request #231 from dhellmann/pypi-filter-factory-json

use json api to retrieve names from pypi.org
sphinx-contrib · Dec 19, 2024 · d41eceb · d41eceb
2 parents 65a9b84 + 2eafb07
commit d41eceb
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 70 deletions.
diff --git a/docs/source/history.rst b/docs/source/history.rst
@@ -35,6 +35,8 @@ Bug Fixes
 
 - `#229 <https://github.com/sphinx-contrib/spelling/pull/229>`__ Gracefully
   handle if git is not installed
+- `#227 <https://github.com/sphinx-contrib/spelling/pull/227>`__ Use pypi.org's
+  JSON API instead of XML-RPC.
 
 7.7.0
 =====

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
 
 requires-python = ">=3.10"
 
-dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0"]
+dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0", "requests>=2.32.3"]
 
 [project.optional-dependencies]
 test = ["pytest", "pytest-cov", "coverage!=4.4,>=4.0"]

diff --git a/sphinxcontrib/spelling/filters.py b/sphinxcontrib/spelling/filters.py
@@ -1,17 +1,16 @@
 #
 # Copyright (c) 2010 Doug Hellmann.  All rights reserved.
 #
-"""Spelling checker extension for Sphinx.
-"""
+"""Spelling checker extension for Sphinx."""
 
 # TODO - Words with multiple uppercase letters treated as classes and ignored
 
 import builtins
 import importlib
 import subprocess
 import sys
-from xmlrpc import client as xmlrpc_client
 
+import requests
 from enchant.tokenize import Filter, get_tokenizer, tokenize, unit_tokenize
 from sphinx.util import logging
 
@@ -22,18 +21,19 @@ class AcronymFilter(Filter):
     """If a word looks like an acronym (all upper case letters),
     ignore it.
     """
+
     def _skip(self, word):
         return (
-            word.isupper() or  # all caps
+            word.isupper()  # all caps
+            or
             # pluralized acronym ("URLs")
-            (word[-1].lower() == 's' and word[:-1].isupper())
+            (word[-1].lower() == "s" and word[:-1].isupper())
         )
 
 
 class list_tokenize(tokenize):
-
     def __init__(self, words):
-        super().__init__('')
+        super().__init__("")
         self._words = words
 
     def next(self):
@@ -44,8 +44,8 @@ def next(self):
 
 
 class ContractionFilter(Filter):
-    """Strip common contractions from words.
-    """
+    """Strip common contractions from words."""
+
     splits = {
         "aren't": ["are", "not"],
         "can't": ["can", "not"],
@@ -138,8 +138,7 @@ def _split(self, word):
 
 
 class IgnoreWordsFilter(Filter):
-    """Given a set of words, ignore them all.
-    """
+    """Given a set of words, ignore them all."""
 
     def __init__(self, tokenizer, word_set):
         self.word_set = set(word_set)
@@ -150,7 +149,6 @@ def _skip(self, word):
 
 
 class IgnoreWordsFilterFactory:
-
     def __init__(self, words):
         self.words = words
 
@@ -159,23 +157,31 @@ def __call__(self, tokenizer):
 
 
 class PyPIFilterFactory(IgnoreWordsFilterFactory):
-    """Build an IgnoreWordsFilter for all of the names of packages on PyPI.
-    """
+    """Build an IgnoreWordsFilter for all of the names of packages on PyPI."""
+
     def __init__(self):
-        client = xmlrpc_client.ServerProxy('https://pypi.python.org/pypi')
-        super().__init__(client.list_packages())
+        r = requests.get(
+            "https://pypi.org/simple/",
+            headers={
+                "user-agent": "sphinxcontrib.spelling",
+                "accept": "application/vnd.pypi.simple.v1+json",
+            },
+        )
+        names = [i["name"] for i in r.json()["projects"]]
+        logger.debug("retrieved %d project names from pypi.org", len(names))
+        super().__init__(names)
 
 
 class PythonBuiltinsFilter(Filter):
-    """Ignore names of built-in Python symbols.
-    """
+    """Ignore names of built-in Python symbols."""
+
     def _skip(self, word):
         return hasattr(builtins, word)
 
 
 class ImportableModuleFilter(Filter):
-    """Ignore names of modules that we could import.
-    """
+    """Ignore names of modules that we could import."""
+
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
         self.found_modules = set(sys.builtin_module_names)
@@ -185,7 +191,7 @@ def __init__(self, tokenizer):
         # valid module, which is consistent with the behavior before
         # version 7.3.1.  See
         # https://github.com/sphinx-contrib/spelling/issues/141
-        self.sought_modules.add('__main__')
+        self.sought_modules.add("__main__")
 
     def _skip(self, word):
         # If the word looks like a python module filename, strip the
@@ -195,13 +201,13 @@ def _skip(self, word):
         # it look like Sphinx is complaining about a commandline
         # argument. See
         # https://github.com/sphinx-contrib/spelling/issues/142
-        if word.endswith('.py'):
+        if word.endswith(".py"):
             logger.debug(
-                'removing .py extension from %r before searching for module',
-                word)
+                "removing .py extension from %r before searching for module", word
+            )
             word = word[:-3]
 
-        valid_module_name = all(n.isidentifier() for n in word.split('.'))
+        valid_module_name = all(n.isidentifier() for n in word.split("."))
         if not valid_module_name:
             return False
 
@@ -214,8 +220,7 @@ def _skip(self, word):
                 # error out of distutils, or something else triggered
                 # by failing to be able to import a parent package to
                 # use the metadata to search for a subpackage.
-                logger.debug('find_spec(%r) failed, invalid module name: %s',
-                             word, err)
+                logger.debug("find_spec(%r) failed, invalid module name: %s", word, err)
             else:
                 if mod is not None:
                     self.found_modules.add(word)
@@ -230,25 +235,28 @@ class ContributorFilter(IgnoreWordsFilter):
     tokens that are in the set.
     """
 
-    _pretty_format = (
-        '%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn'
-    )
+    _pretty_format = "%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn"
 
     def __init__(self, tokenizer):
         contributors = self._get_contributors()
         super().__init__(tokenizer, contributors)
 
     def _get_contributors(self):
-        logger.info('Scanning contributors')
-        cmd = ['git', 'log', '--quiet', '--no-color',
-               f'--pretty=format:{self._pretty_format}']
+        logger.info("Scanning contributors")
+        cmd = [
+            "git",
+            "log",
+            "--quiet",
+            "--no-color",
+            f"--pretty=format:{self._pretty_format}",
+        ]
 
         try:
             p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
         except (subprocess.CalledProcessError, FileNotFoundError) as err:
-            logger.warning('Called: %s', ' '.join(cmd))
-            logger.warning('Failed to scan contributors: %s', err)
+            logger.warning("Called: %s", " ".join(cmd))
+            logger.warning("Failed to scan contributors: %s", err)
             return set()
-        output = p.stdout.decode('utf-8')
-        tokenizer = get_tokenizer('en_US', filters=[])
+        output = p.stdout.decode("utf-8")
+        tokenizer = get_tokenizer("en_US", filters=[])
         return {word for word, pos in tokenizer(output)}
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -1,8 +1,7 @@
 #
 # Copyright (c) 2010 Doug Hellmann.  All rights reserved.
 #
-"""Tests for filters.
-"""
+"""Tests for filters."""
 
 import contextlib
 import logging
@@ -12,38 +11,38 @@
 import pytest
 from enchant.tokenize import get_tokenizer
 
-from sphinxcontrib.spelling import filters # isort:skip
-from tests import helpers # isort:skip
+from sphinxcontrib.spelling import filters  # isort:skip
+from tests import helpers  # isort:skip
 
 # Replace the sphinx logger with a normal one so pytest can collect
 # the output.
-filters.logger = logging.getLogger('test.filters')
+filters.logger = logging.getLogger("test.filters")
 
 
 def test_builtin_unicode():
     f = filters.PythonBuiltinsFilter(None)
-    assert not f._skip('passé')
+    assert not f._skip("passé")
 
 
 def test_builtin_regular():
     f = filters.PythonBuiltinsFilter(None)
-    assert f._skip('print')
+    assert f._skip("print")
 
 
 def test_acronym():
-    text = 'a front-end for DBM-style databases'
-    t = get_tokenizer('en_US', [])
+    text = "a front-end for DBM-style databases"
+    t = get_tokenizer("en_US", [])
     f = filters.AcronymFilter(t)
     words = [w[0] for w in f(text)]
-    assert 'DBM' not in words, 'Failed to filter out acronym'
+    assert "DBM" not in words, "Failed to filter out acronym"
 
 
 def test_acronym_unicode():
-    text = 'a front-end for DBM-style databases'
-    t = get_tokenizer('en_US', [])
+    text = "a front-end for DBM-style databases"
+    t = get_tokenizer("en_US", [])
     f = filters.AcronymFilter(t)
     words = [w[0] for w in f(text)]
-    assert 'DBM' not in words, 'Failed to filter out acronym'
+    assert "DBM" not in words, "Failed to filter out acronym"
 
 
 @helpers.require_git_repo
@@ -77,7 +76,7 @@ def test_acronym_unicode():
         "Timotheus",
         "Tobias",
         "Tricoli",
-    ]
+    ],
 )
 def test_contributors(name):
     f = filters.ContributorFilter(None)
@@ -87,11 +86,11 @@ def test_contributors(name):
 @pytest.mark.parametrize(
     "word,expected",
     [
-        ('os', True),
-        ('os.name', False),
-        ('__main__', False),
+        ("os", True),
+        ("os.name", False),
+        ("__main__", False),
         ("don't", False),
-    ]
+    ],
 )
 def test_importable_module_skip(word, expected):
     f = filters.ImportableModuleFilter(None)
@@ -110,42 +109,45 @@ def import_path(new_path):
 
 
 def test_importable_module_with_side_effets(tmpdir):
-    logging.debug('tmpdir %r', tmpdir)
-    logging.debug('cwd %r', os.getcwd())
+    logging.debug("tmpdir %r", tmpdir)
+    logging.debug("cwd %r", os.getcwd())
 
-    parentdir = tmpdir.join('parent')
+    parentdir = tmpdir.join("parent")
     parentdir.mkdir()
 
-    parentdir.join('__init__.py').write(
-        'raise SystemExit("exit as side-effect")\n'
-    )
-    parentdir.join('child.py').write('')
+    parentdir.join("__init__.py").write('raise SystemExit("exit as side-effect")\n')
+    parentdir.join("child.py").write("")
 
     with import_path([str(tmpdir)] + sys.path):
         f = filters.ImportableModuleFilter(None)
-        skip_parent = f._skip('parent')
-        skip_both = f._skip('parent.child')
+        skip_parent = f._skip("parent")
+        skip_both = f._skip("parent.child")
 
     # The parent module name is valid because it is not imported, only
     # discovered.
     assert skip_parent is True
-    assert 'parent' in f.found_modules
+    assert "parent" in f.found_modules
 
     # The child module name is not valid because the parent is
     # imported to find the child and that triggers the side-effect.
     assert skip_both is False
-    assert 'parent.child' not in f.found_modules
+    assert "parent.child" not in f.found_modules
 
 
 def test_importable_module_with_system_exit(tmpdir):
-    path = tmpdir.join('mytestmodule.py')
+    path = tmpdir.join("mytestmodule.py")
     path.write('raise SystemExit("exit as side-effect")\n')
 
     with import_path([str(tmpdir)] + sys.path):
         f = filters.ImportableModuleFilter(None)
-        skip = f._skip('mytestmodule')
+        skip = f._skip("mytestmodule")
 
     # The filter does not actually import the module in this case, so
     # it shows up as a valid word.
     assert skip is True
-    assert 'mytestmodule' in f.found_modules
+    assert "mytestmodule" in f.found_modules
+
+
+def test_pypi_filter_factory():
+    f = filters.PyPIFilterFactory()
+    assert "sphinxcontrib-spelling" in f.words