diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bcc36732..371e7545 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.2 + rev: v0.7.4 hooks: - id: ruff - id: ruff-format args: ["--check"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.8.0" + rev: "v1.13.0" hooks: - id: mypy additional_dependencies: diff --git a/pyproject.toml b/pyproject.toml index 0ece6c64..41a7e071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,8 +64,8 @@ [project.optional-dependencies] dev = [ - "ruff==0.2.2", - "mypy==1.8.0", + "ruff==0.7.4", + "mypy==1.13.0", "types-PyYAML", "types-python-dateutil", "types-requests", @@ -119,6 +119,7 @@ src = ["src", "tests"] line-length = 120 target-version = "py310" + extend-exclude = ["*.ipynb"] [tool.ruff.lint.mccabe] max-complexity = 10 diff --git a/requirements/all_requirements.txt b/requirements/all_requirements.txt index 13693653..f94889f9 100644 --- a/requirements/all_requirements.txt +++ b/requirements/all_requirements.txt @@ -2,7 +2,7 @@ accessible-pygments==0.0.4 # via pydata-sphinx-theme aiohappyeyeballs==2.4.0 # via aiohttp -aiohttp==3.10.2 +aiohttp==3.10.11 # via # datasets # fsspec @@ -114,6 +114,8 @@ datasets==2.15.0 # setfit datashader==0.16.0 # via umap-learn +dateparser==1.2.0 + # via sec-certs (./../pyproject.toml) debugpy==1.8.0 # via ipykernel decorator==5.1.1 @@ -309,7 +311,7 @@ murmurhash==1.0.10 # preshed # spacy # thinc -mypy==1.8.0 +mypy==1.13.0 # via sec-certs (./../pyproject.toml) mypy-extensions==1.0.0 # via mypy @@ -467,6 +469,8 @@ preshed==3.0.9 # thinc prompt-toolkit==3.0.41 # via ipython +propcache==0.2.0 + # via yarl psutil==5.9.6 # via # ipykernel @@ -535,6 +539,7 @@ pytest-profiling==1.7.0 # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via + # dateparser # jupyter-client # matplotlib # pandas @@ -542,7 +547,9 @@ python-dateutil==2.8.2 python-dotenv==1.0.0 # 
via pydantic-settings pytz==2023.3.post1 - # via pandas + # via + # dateparser + # pandas pyviz-comms==3.0.0 # via # holoviews @@ -572,6 +579,7 @@ referencing==0.31.0 # jsonschema-specifications regex==2023.10.3 # via + # dateparser # nltk # transformers requests==2.32.0 @@ -596,7 +604,7 @@ rpds-py==0.13.1 # via # jsonschema # referencing -ruff==0.2.2 +ruff==0.7.4 # via sec-certs (./../pyproject.toml) safetensors==0.4.5 # via transformers @@ -728,7 +736,7 @@ toolz==0.12.0 # dask # datashader # partd -torch==2.2.0 +torch==2.1.1 # via # sentence-transformers # torchvision @@ -792,6 +800,8 @@ typing-extensions==4.8.0 # typer tzdata==2023.3 # via pandas +tzlocal==5.2 + # via dateparser uc-micro-py==1.0.2 # via linkify-it-py umap-learn[plot]==0.5.5 @@ -834,7 +844,7 @@ xyzservices==2023.10.1 # via # bokeh # panel -yarl==1.9.3 +yarl==1.17.2 # via aiohttp zipp==3.19.1 # via importlib-metadata diff --git a/requirements/dev_requirements.txt b/requirements/dev_requirements.txt index 4ea46d2e..4db7d080 100644 --- a/requirements/dev_requirements.txt +++ b/requirements/dev_requirements.txt @@ -2,7 +2,7 @@ accessible-pygments==0.0.4 # via pydata-sphinx-theme aiohappyeyeballs==2.4.0 # via aiohttp -aiohttp==3.10.2 +aiohttp==3.10.11 # via # datasets # fsspec @@ -12,8 +12,14 @@ alabaster==0.7.13 # via sphinx annotated-types==0.6.0 # via pydantic +appnope==0.1.4 + # via + # ipykernel + # ipython asttokens==2.4.1 # via stack-data +async-timeout==5.0.1 + # via aiohttp attrs==23.1.0 # via # aiohttp @@ -27,7 +33,7 @@ babel==2.13.1 beautifulsoup4==4.12.2 # via # pydata-sphinx-theme - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) blis==0.7.11 # via thinc build==1.0.3 @@ -49,7 +55,7 @@ click==8.1.7 # via # jupyter-cache # pip-tools - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # typer cloudpathlib==0.16.0 # via weasel @@ -64,7 +70,9 @@ confection==0.1.3 contourpy==1.2.0 # via matplotlib coverage[toml]==7.3.2 - # via pytest-cov + # via + # 
coverage + # pytest-cov cryptography==43.0.1 # via pypdf cycler==0.12.1 @@ -75,9 +83,9 @@ cymem==2.0.8 # spacy # thinc datasets==2.15.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) dateparser==1.2.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) debugpy==1.8.0 # via ipykernel decorator==5.1.1 @@ -97,6 +105,10 @@ docutils==0.19 # myst-parser # pydata-sphinx-theme # sphinx +exceptiongroup==1.2.2 + # via + # ipython + # pytest executing==2.0.1 # via stack-data fastjsonschema==2.19.0 @@ -114,13 +126,12 @@ frozenlist==1.4.0 fsspec[http]==2023.10.0 # via # datasets + # fsspec # huggingface-hub gprof2dot==2022.7.29 # via pytest-profiling -greenlet==3.1.1 - # via sqlalchemy html5lib==1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) huggingface-hub==0.19.4 # via datasets identify==2.5.32 @@ -140,15 +151,15 @@ iniconfig==2.0.0 ipykernel==6.27.0 # via # myst-nb - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ipython==8.17.2 # via # ipykernel # ipywidgets # myst-nb - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ipywidgets==8.1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jedi==0.19.1 # via ipython jinja2==3.1.4 @@ -161,7 +172,7 @@ joblib==1.3.2 jsonschema==4.20.0 # via # nbformat - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) jsonschema-specifications==2023.11.1 # via jsonschema jupyter-cache==1.0.0 @@ -185,7 +196,7 @@ langcodes==3.3.0 lxml==4.9.3 # via # pikepdf - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) markdown-it-py==3.0.0 # via # mdit-py-plugins @@ -196,7 +207,7 @@ matplotlib==3.8.2 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) matplotlib-inline==0.1.6 # via # ipykernel @@ -218,12 +229,12 @@ murmurhash==1.0.10 # preshed # spacy # thinc -mypy==1.8.0 - # via sec-certs (../pyproject.toml) 
+mypy==1.13.0 + # via sec-certs (./../pyproject.toml) mypy-extensions==1.0.0 # via mypy myst-nb==1.0.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) myst-parser==2.0.0 # via myst-nb nbclient==0.9.0 @@ -238,7 +249,7 @@ nbformat==5.9.2 nest-asyncio==1.5.8 # via ipykernel networkx==3.2.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) nodeenv==1.8.0 # via pre-commit numpy==1.26.2 @@ -253,7 +264,7 @@ numpy==1.26.2 # scikit-learn # scipy # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # tabula-py # thinc @@ -278,26 +289,26 @@ pandas==2.1.3 # datasets # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # tabula-py parso==0.8.3 # via jedi pdftotext==2.2.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pexpect==4.8.0 # via ipython pikepdf==8.7.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pillow==10.3.0 # via # matplotlib # pikepdf # pytesseract - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pip-tools==7.3.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pkgconfig==1.5.5 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) platformdirs==4.0.0 # via # jupyter-core @@ -305,19 +316,21 @@ platformdirs==4.0.0 pluggy==1.3.0 # via pytest pre-commit==3.5.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) preshed==3.0.9 # via # spacy # thinc prompt-toolkit==3.0.41 # via ipython +propcache==0.2.0 + # via yarl psutil==5.9.6 # via # ipykernel # memory-profiler # pytest-monitor - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -332,14 +345,14 @@ pydantic==2.5.2 # via # confection # pydantic-settings - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # thinc # weasel 
pydantic-core==2.14.5 # via pydantic pydantic-settings==2.1.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pydata-sphinx-theme==0.14.3 # via sphinx-book-theme pygments==2.17.2 @@ -351,32 +364,34 @@ pygments==2.17.2 pyparsing==3.1.1 # via matplotlib pypdf[crypto]==3.17.1 - # via sec-certs (../pyproject.toml) + # via + # pypdf + # sec-certs (./../pyproject.toml) pyproject-hooks==1.0.0 # via build pysankeybeta==1.4.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytesseract==0.3.10 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytest==7.4.3 # via # pytest-cov # pytest-monitor # pytest-profiling - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pytest-cov==4.1.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytest-monitor==1.6.6 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytest-profiling==1.7.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via # dateparser # jupyter-client # matplotlib # pandas - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) python-dotenv==1.0.0 # via pydantic-settings pytz==2023.3.post1 @@ -391,13 +406,13 @@ pyyaml==6.0.1 # myst-nb # myst-parser # pre-commit - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pyzmq==25.1.1 # via # ipykernel # jupyter-client rapidfuzz==3.5.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) referencing==0.31.0 # via # jsonschema @@ -410,7 +425,7 @@ requests==2.32.0 # fsspec # huggingface-hub # pytest-monitor - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # sphinx # weasel @@ -418,20 +433,20 @@ rpds-py==0.13.1 # via # jsonschema # referencing -ruff==0.2.2 - # via sec-certs (../pyproject.toml) +ruff==0.7.4 + # via sec-certs (./../pyproject.toml) scikit-learn==1.5.0 - # via sec-certs 
(../pyproject.toml) + # via sec-certs (./../pyproject.toml) scipy==1.11.4 # via # scikit-learn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) seaborn==0.13.0 # via # pysankeybeta - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) setuptools-scm==8.0.4 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) six==1.16.0 # via # asttokens @@ -447,7 +462,7 @@ snowballstemmer==2.2.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -457,7 +472,7 @@ sphinx==6.2.1 # myst-nb # myst-parser # pydata-sphinx-theme - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # sphinx-book-theme # sphinx-copybutton # sphinx-design @@ -467,11 +482,11 @@ sphinx==6.2.1 # sphinxcontrib-qthelp # sphinxcontrib-serializinghtml sphinx-book-theme==1.0.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) sphinx-copybutton==0.5.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) sphinx-design==0.5.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) sphinxcontrib-applehelp==1.0.7 # via sphinx sphinxcontrib-devhelp==1.0.5 @@ -495,13 +510,22 @@ srsly==2.4.8 stack-data==0.6.3 # via ipython tabula-py==2.9.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) tabulate==0.9.0 # via jupyter-cache thinc==8.2.1 # via spacy threadpoolctl==3.2.0 # via scikit-learn +tomli==2.1.0 + # via + # build + # coverage + # mypy + # pip-tools + # pyproject-hooks + # pytest + # setuptools-scm tornado==6.4.1 # via # ipykernel @@ -510,7 +534,7 @@ tqdm==4.66.3 # via # datasets # huggingface-hub - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy traitlets==5.13.0 # via @@ -528,13 +552,14 @@ typer==0.9.0 # spacy # weasel types-python-dateutil==2.8.19.14 - # via sec-certs 
(../pyproject.toml) + # via sec-certs (./../pyproject.toml) types-pyyaml==6.0.12.12 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) types-requests==2.31.0.10 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) typing-extensions==4.8.0 # via + # cloudpathlib # huggingface-hub # mypy # myst-nb @@ -575,7 +600,7 @@ wrapt==1.16.0 # via deprecated xxhash==3.4.1 # via datasets -yarl==1.9.3 +yarl==1.17.2 # via aiohttp zipp==3.19.1 # via importlib-metadata diff --git a/requirements/nlp_requirements.txt b/requirements/nlp_requirements.txt index b6646873..a8868053 100644 --- a/requirements/nlp_requirements.txt +++ b/requirements/nlp_requirements.txt @@ -1,6 +1,6 @@ aiohappyeyeballs==2.4.0 # via aiohttp -aiohttp==3.10.2 +aiohttp==3.10.11 # via # datasets # fsspec @@ -92,6 +92,8 @@ datasets==2.15.0 # setfit datashader==0.16.0 # via umap-learn +dateparser==1.2.0 + # via sec-certs (./../pyproject.toml) debugpy==1.8.0 # via ipykernel decorator==5.1.1 @@ -372,6 +374,8 @@ preshed==3.0.9 # thinc prompt-toolkit==3.0.41 # via ipython +propcache==0.2.0 + # via yarl psutil==5.9.6 # via # ipykernel @@ -418,6 +422,7 @@ pytesseract==0.3.10 # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via + # dateparser # jupyter-client # matplotlib # pandas @@ -425,7 +430,9 @@ python-dateutil==2.8.2 python-dotenv==1.0.0 # via pydantic-settings pytz==2023.3.post1 - # via pandas + # via + # dateparser + # pandas pyviz-comms==3.0.0 # via # holoviews @@ -451,6 +458,7 @@ referencing==0.31.0 # jsonschema-specifications regex==2023.10.3 # via + # dateparser # nltk # transformers requests==2.32.0 @@ -558,7 +566,7 @@ toolz==0.12.0 # dask # datashader # partd -torch==2.2.0 +torch==2.1.1 # via # sentence-transformers # torchvision @@ -611,6 +619,8 @@ typing-extensions==4.8.0 # typer tzdata==2023.3 # via pandas +tzlocal==5.2 + # via dateparser uc-micro-py==1.0.2 # via linkify-it-py umap-learn[plot]==0.5.5 @@ -646,7 +656,7 @@ 
xyzservices==2023.10.1 # via # bokeh # panel -yarl==1.9.3 +yarl==1.17.2 # via aiohttp zipp==3.19.1 # via importlib-metadata diff --git a/requirements/requirements.txt b/requirements/requirements.txt index ddfcd939..a89cf985 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,5 +1,9 @@ annotated-types==0.6.0 # via pydantic +appnope==0.1.4 + # via + # ipykernel + # ipython asttokens==2.4.1 # via stack-data attrs==23.1.0 @@ -7,7 +11,7 @@ attrs==23.1.0 # jsonschema # referencing beautifulsoup4==4.12.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) blis==0.7.11 # via thinc catalogue==2.0.10 @@ -23,7 +27,7 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # typer cloudpathlib==0.16.0 # via weasel @@ -47,7 +51,7 @@ cymem==2.0.8 # spacy # thinc dateparser==1.2.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) debugpy==1.8.0 # via ipykernel decorator==5.1.1 @@ -56,22 +60,24 @@ deprecated==1.2.14 # via pikepdf distro==1.8.0 # via tabula-py +exceptiongroup==1.2.2 + # via ipython executing==2.0.1 # via stack-data fonttools==4.45.0 # via matplotlib html5lib==1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) idna==3.7 # via requests ipykernel==6.27.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) ipython==8.17.2 # via # ipykernel # ipywidgets ipywidgets==8.1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jedi==0.19.1 # via ipython jinja2==3.1.4 @@ -79,7 +85,7 @@ jinja2==3.1.4 joblib==1.3.2 # via scikit-learn jsonschema==4.20.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jsonschema-specifications==2023.11.1 # via jsonschema jupyter-client==8.6.0 @@ -97,14 +103,14 @@ langcodes==3.3.0 lxml==4.9.3 # via # pikepdf - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) 
markupsafe==2.1.3 # via jinja2 matplotlib==3.8.2 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) matplotlib-inline==0.1.6 # via # ipykernel @@ -117,7 +123,7 @@ murmurhash==1.0.10 nest-asyncio==1.5.8 # via ipykernel networkx==3.2.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) numpy==1.26.2 # via # blis @@ -128,7 +134,7 @@ numpy==1.26.2 # scikit-learn # scipy # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # tabula-py # thinc @@ -146,24 +152,24 @@ pandas==2.1.3 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # tabula-py parso==0.8.3 # via jedi pdftotext==2.2.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pexpect==4.8.0 # via ipython pikepdf==8.7.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pillow==10.3.0 # via # matplotlib # pikepdf # pytesseract - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pkgconfig==1.5.5 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) platformdirs==4.0.0 # via jupyter-core preshed==3.0.9 @@ -175,7 +181,7 @@ prompt-toolkit==3.0.41 psutil==5.9.6 # via # ipykernel - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -186,31 +192,33 @@ pydantic==2.5.2 # via # confection # pydantic-settings - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # thinc # weasel pydantic-core==2.14.5 # via pydantic pydantic-settings==2.1.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pygments==2.17.2 # via ipython pyparsing==3.1.1 # via matplotlib pypdf[crypto]==3.17.1 - # via sec-certs (../pyproject.toml) + # via + # pypdf + # sec-certs (./../pyproject.toml) pysankeybeta==1.4.1 - # via sec-certs (../pyproject.toml) + # via sec-certs 
(./../pyproject.toml) pytesseract==0.3.10 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via # dateparser # jupyter-client # matplotlib # pandas - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) python-dotenv==1.0.0 # via pydantic-settings pytz==2023.3.post1 @@ -218,13 +226,13 @@ pytz==2023.3.post1 # dateparser # pandas pyyaml==6.0.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pyzmq==25.1.1 # via # ipykernel # jupyter-client rapidfuzz==3.5.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) referencing==0.31.0 # via # jsonschema @@ -233,7 +241,7 @@ regex==2024.9.11 # via dateparser requests==2.32.0 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # weasel rpds-py==0.13.1 @@ -241,17 +249,17 @@ rpds-py==0.13.1 # jsonschema # referencing scikit-learn==1.5.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) scipy==1.11.4 # via # scikit-learn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) seaborn==0.13.0 # via # pysankeybeta - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) setuptools-scm==8.0.4 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) six==1.16.0 # via # asttokens @@ -264,7 +272,7 @@ smart-open==6.4.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -278,18 +286,20 @@ srsly==2.4.8 stack-data==0.6.3 # via ipython tabula-py==2.9.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) thinc==8.2.1 # via spacy threadpoolctl==3.2.0 # via scikit-learn +tomli==2.1.0 + # via setuptools-scm tornado==6.4.1 # via # ipykernel # jupyter-client tqdm==4.66.3 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy 
traitlets==5.13.0 # via @@ -306,6 +316,7 @@ typer==0.9.0 # weasel typing-extensions==4.8.0 # via + # cloudpathlib # pydantic # pydantic-core # setuptools-scm diff --git a/requirements/test_requirements.txt b/requirements/test_requirements.txt index ab3bf762..8b5b5f68 100644 --- a/requirements/test_requirements.txt +++ b/requirements/test_requirements.txt @@ -1,5 +1,9 @@ annotated-types==0.6.0 # via pydantic +appnope==0.1.4 + # via + # ipykernel + # ipython asttokens==2.4.1 # via stack-data attrs==23.1.0 @@ -7,7 +11,7 @@ attrs==23.1.0 # jsonschema # referencing beautifulsoup4==4.12.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) blis==0.7.11 # via thinc catalogue==2.0.10 @@ -23,7 +27,7 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # typer cloudpathlib==0.16.0 # via weasel @@ -40,7 +44,7 @@ contourpy==1.2.0 coverage[toml]==7.3.2 # via # pytest-cov - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) cryptography==43.0.1 # via pypdf cycler==0.12.1 @@ -51,7 +55,7 @@ cymem==2.0.8 # spacy # thinc dateparser==1.2.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) debugpy==1.8.0 # via ipykernel decorator==5.1.1 @@ -60,24 +64,28 @@ deprecated==1.2.14 # via pikepdf distro==1.8.0 # via tabula-py +exceptiongroup==1.2.2 + # via + # ipython + # pytest executing==2.0.1 # via stack-data fonttools==4.45.0 # via matplotlib html5lib==1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) idna==3.7 # via requests iniconfig==2.0.0 # via pytest ipykernel==6.27.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) ipython==8.17.2 # via # ipykernel # ipywidgets ipywidgets==8.1.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jedi==0.19.1 # via ipython jinja2==3.1.4 @@ -85,7 +93,7 @@ jinja2==3.1.4 joblib==1.3.2 # via scikit-learn 
jsonschema==4.20.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) jsonschema-specifications==2023.11.1 # via jsonschema jupyter-client==8.6.0 @@ -103,14 +111,14 @@ langcodes==3.3.0 lxml==4.9.3 # via # pikepdf - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) markupsafe==2.1.3 # via jinja2 matplotlib==3.8.2 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) matplotlib-inline==0.1.6 # via # ipykernel @@ -123,7 +131,7 @@ murmurhash==1.0.10 nest-asyncio==1.5.8 # via ipykernel networkx==3.2.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) numpy==1.26.2 # via # blis @@ -134,7 +142,7 @@ numpy==1.26.2 # scikit-learn # scipy # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # tabula-py # thinc @@ -153,24 +161,24 @@ pandas==2.1.3 # via # pysankeybeta # seaborn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # tabula-py parso==0.8.3 # via jedi pdftotext==2.2.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pexpect==4.8.0 # via ipython pikepdf==8.7.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pillow==10.3.0 # via # matplotlib # pikepdf # pytesseract - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pkgconfig==1.5.5 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) platformdirs==4.0.0 # via jupyter-core pluggy==1.3.0 @@ -184,7 +192,7 @@ prompt-toolkit==3.0.41 psutil==5.9.6 # via # ipykernel - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -195,37 +203,39 @@ pydantic==2.5.2 # via # confection # pydantic-settings - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # thinc # weasel pydantic-core==2.14.5 # via pydantic pydantic-settings==2.1.0 - # via sec-certs (../pyproject.toml) + # 
via sec-certs (./../pyproject.toml) pygments==2.17.2 # via ipython pyparsing==3.1.1 # via matplotlib pypdf[crypto]==3.17.1 - # via sec-certs (../pyproject.toml) + # via + # pypdf + # sec-certs (./../pyproject.toml) pysankeybeta==1.4.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytesseract==0.3.10 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pytest==7.4.3 # via # pytest-cov - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) pytest-cov==4.1.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via # dateparser # jupyter-client # matplotlib # pandas - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) python-dotenv==1.0.0 # via pydantic-settings pytz==2023.3.post1 @@ -233,13 +243,13 @@ pytz==2023.3.post1 # dateparser # pandas pyyaml==6.0.1 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) pyzmq==25.1.1 # via # ipykernel # jupyter-client rapidfuzz==3.5.2 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) referencing==0.31.0 # via # jsonschema @@ -248,7 +258,7 @@ regex==2024.9.11 # via dateparser requests==2.32.0 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy # weasel rpds-py==0.13.1 @@ -256,17 +266,17 @@ rpds-py==0.13.1 # jsonschema # referencing scikit-learn==1.5.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) scipy==1.11.4 # via # scikit-learn - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) seaborn==0.13.0 # via # pysankeybeta - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) setuptools-scm==8.0.4 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) six==1.16.0 # via # asttokens @@ -279,7 +289,7 @@ smart-open==6.4.0 soupsieve==2.5 # via beautifulsoup4 spacy==3.7.2 - # via sec-certs (../pyproject.toml) + # via sec-certs 
(./../pyproject.toml) spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -293,18 +303,23 @@ srsly==2.4.8 stack-data==0.6.3 # via ipython tabula-py==2.9.0 - # via sec-certs (../pyproject.toml) + # via sec-certs (./../pyproject.toml) thinc==8.2.1 # via spacy threadpoolctl==3.2.0 # via scikit-learn +tomli==2.1.0 + # via + # coverage + # pytest + # setuptools-scm tornado==6.4.1 # via # ipykernel # jupyter-client tqdm==4.66.3 # via - # sec-certs (../pyproject.toml) + # sec-certs (./../pyproject.toml) # spacy traitlets==5.13.0 # via @@ -321,6 +336,7 @@ typer==0.9.0 # weasel typing-extensions==4.8.0 # via + # cloudpathlib # pydantic # pydantic-core # setuptools-scm diff --git a/src/sec_certs/dataset/dataset.py b/src/sec_certs/dataset/dataset.py index c906fd59..602f1d9e 100644 --- a/src/sec_certs/dataset/dataset.py +++ b/src/sec_certs/dataset/dataset.py @@ -544,9 +544,7 @@ def filter_condition(cpe: CPE) -> bool: return False if re.match(constants.RELEASE_CANDIDATE_REGEX, cpe.update): return False - if cpe in WINDOWS_WEAK_CPES: - return False - return True + return cpe not in WINDOWS_WEAK_CPES if not self.auxiliary_datasets.cpe_dset: self.auxiliary_datasets.cpe_dset = self._prepare_cpe_dataset() diff --git a/src/sec_certs/sample/sar.py b/src/sec_certs/sample/sar.py index ea4a6e14..771e24bc 100644 --- a/src/sec_certs/sample/sar.py +++ b/src/sec_certs/sample/sar.py @@ -43,9 +43,7 @@ def from_string(cls, string: str) -> SAR: @staticmethod def contains_level(string: str) -> bool: - if len(string.split(".")) == 1: - return False - return True + return len(string.split(".")) != 1 @staticmethod def matches_re(string: str) -> bool: diff --git a/src/sec_certs/utils/pdf.py b/src/sec_certs/utils/pdf.py index f2c2c58e..3195e67e 100644 --- a/src/sec_certs/utils/pdf.py +++ b/src/sec_certs/utils/pdf.py @@ -279,6 +279,4 @@ def text_is_garbage(text: str) -> bool: if every_second < GARBAGE_EVERY_SECOND_CHAR_THRESHOLD: return True # If there is a small ratio of alphanumeric chars to all 
chars, this is garbage. - if alpha < GARBAGE_ALPHA_CHARS_THRESHOLD: - return True - return False + return alpha < GARBAGE_ALPHA_CHARS_THRESHOLD