diff --git a/.drone.yml b/.drone.yml
index bed252b..ebb1c67 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -14,25 +14,25 @@ trigger:
 steps:
   - name: lint-and-type-check
-    image: python:3.9-slim
+    image: python:3.10-slim
     environment:
       AWS_ACCOUNT_ID:
         from_secret: AWS_ACCOUNT_ID
     commands:
-      - pip install -U pip==21.3.1
+      - pip install -U pip==23.3.2
       - pip install awscli
       - aws codeartifact login --tool pip --repository globality-pypi-local --domain globality --domain-owner $AWS_ACCOUNT_ID --region us-east-1
       - ./entrypoint.sh lint
       - ./entrypoint.sh typehinting
 
-  - name: test-py39-latest
-    image: python:3.9-slim
+  - name: test-py310-latest
+    image: python:3.10-slim
     environment:
       AWS_ACCOUNT_ID:
         from_secret: AWS_ACCOUNT_ID
     commands:
-      - pip install -U pip==21.3.1
+      - pip install -U pip==23.3.2
       - pip install awscli
       - aws codeartifact login --tool pip --repository globality-pypi-local --domain globality --domain-owner $AWS_ACCOUNT_ID --region us-east-1
       - ./entrypoint.sh test
@@ -51,7 +51,7 @@ steps:
         -Dsonar.branch.name=${DRONE_BRANCH}
     depends_on:
       - lint-and-type-check
-      - test-py39-latest
+      - test-py310-latest
 
   - name: release-python-library-codeartifact
     image: python:3.10-slim
@@ -65,7 +65,7 @@ steps:
     depends_on:
      - sonar-scanner
     commands:
-      - pip install -U pip==23.2.1
+      - pip install -U pip==23.3.2
       - pip install --quiet awscli twine==4.0.2 packaging==23.1 bumpversion
       - bumpversion minor --allow-dirty --new-version `/bin/date +%Y.%-V`.${DRONE_BUILD_NUMBER}
       - export version=$(cat .bumpversion.cfg | awk '/current_version / {print $3}')
@@ -108,7 +108,7 @@ trigger:
 steps:
   - name: dependency-validation-dummy
     pull: always
-    image: python:3.9-slim
+    image: python:3.10-slim
     commands:
       - echo "Dummy step to trigger dependency-validation"
diff --git a/.globality/build.json b/.globality/build.json
index 05b7e63..c71db3f 100644
--- a/.globality/build.json
+++ b/.globality/build.json
@@ -4,7 +4,7 @@
     "core_packages": "locales libicu-dev libgomp1 build-essential wget libffi-dev",
     "dependency_tool": "pip-tools",
     "docker": {
-        "docker_tag": "python:3.9-slim"
+        "docker_tag": "python:3.10-slim"
     },
     "entrypoint": {
         "pre_typehinting_commands": [
@@ -33,7 +33,7 @@
         ],
         "repository": "pypi"
     },
-    "sonar_python_versions": "3.9,3.10",
+    "sonar_python_versions": "3.10",
     "test_command": "pytest",
     "test_py37": false,
     "use_globality_black": true
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 1d0512a..6020153 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,7 @@
-
+
+
+
\ No newline at end of file
diff --git a/deboiler/dataset/avro_dataset.py b/deboiler/dataset/avro_dataset.py
index 3d8f4a8..7b21095 100644
--- a/deboiler/dataset/avro_dataset.py
+++ b/deboiler/dataset/avro_dataset.py
@@ -4,7 +4,6 @@
 from logging import Logger
 from pathlib import Path
-from typing import Optional
 
 from fastavro import reader
 
@@ -21,8 +20,8 @@ def __init__(
         self,
         file_path: str,
         content_key: str = "content",
-        status_key: Optional[str] = "status",
-        content_type_key: Optional[str] = "content_type",
+        status_key: str | None = "status",
+        content_type_key: str | None = "content_type",
         verbose: bool = True,
     ):
         super().__init__(verbose=verbose)
diff --git a/deboiler/dataset/base.py b/deboiler/dataset/base.py
index d7adc7a..586d5a4 100644
--- a/deboiler/dataset/base.py
+++ b/deboiler/dataset/base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod, abstractproperty
-from typing import Mapping, Optional
+from collections.abc import Mapping
 
 from tqdm import tqdm
 
@@ -19,9 +19,9 @@ class DeboilerDataset(ABC):
     def __init__(
         self,
-        content_key: Optional[str] = "content",
-        status_key: Optional[str] = "status",
-        content_type_key: Optional[str] = "content_type",
+        content_key: str | None = "content",
+        status_key: str | None = "status",
+        content_type_key: str | None = "content_type",
         verbose: bool = True,
     ):
         self.cached_pages: Mapping[str, ParsedPage] = dict()
diff --git a/deboiler/dataset/dataframe_dataset.py b/deboiler/dataset/dataframe_dataset.py
index d4da944..97d709d 100644
--- a/deboiler/dataset/dataframe_dataset.py
+++ b/deboiler/dataset/dataframe_dataset.py
@@ -1,5 +1,4 @@
 from logging import Logger
-from typing import Optional
 
 import pandas as pd
 
@@ -20,8 +19,8 @@ def __init__(
         self,
         records: pd.DataFrame,
         content_key: str = "content",
-        status_key: Optional[str] = "status",
-        content_type_key: Optional[str] = "content_type",
+        status_key: str | None = "status",
+        content_type_key: str | None = "content_type",
         verbose: bool = True,
     ):
         super().__init__(
diff --git a/deboiler/dataset/json_dataset.py b/deboiler/dataset/json_dataset.py
index 1879a80..c33fcb2 100644
--- a/deboiler/dataset/json_dataset.py
+++ b/deboiler/dataset/json_dataset.py
@@ -1,7 +1,6 @@
 import json
 from logging import Logger
 from pathlib import Path
-from typing import Optional, Union
 
 from deboiler.dataset.base import DeboilerDataset
 from deboiler.logger import logger
@@ -18,10 +17,10 @@ class JsonDataset(DeboilerDataset):
     def __init__(
         self,
-        file_path: Union[str, Path],
+        file_path: str | Path,
         content_key: str = "content",
-        status_key: Optional[str] = "status",
-        content_type_key: Optional[str] = "content_type",
+        status_key: str | None = "status",
+        content_type_key: str | None = "content_type",
         verbose: bool = True,
     ):
         super().__init__(
diff --git a/deboiler/dataset/list_dataset.py b/deboiler/dataset/list_dataset.py
index 6f49be5..d87ecbb 100644
--- a/deboiler/dataset/list_dataset.py
+++ b/deboiler/dataset/list_dataset.py
@@ -1,5 +1,4 @@
 from logging import Logger
-from typing import Optional
 
 from deboiler.dataset.base import DeboilerDataset
 from deboiler.logger import logger
@@ -18,8 +17,8 @@ def __init__(
         self,
         records: list[dict],
         content_key: str = "content",
-        status_key: Optional[str] = "status",
-        content_type_key: Optional[str] = "content_type",
+        status_key: str | None = "status",
+        content_type_key: str | None = "content_type",
         verbose: bool = True,
     ):
         super().__init__(
diff --git a/deboiler/deboiler.py b/deboiler/deboiler.py
index 388c5b4..9e0dcab 100644
--- a/deboiler/deboiler.py
+++ b/deboiler/deboiler.py
@@ -1,11 +1,11 @@
 from collections import defaultdict
+from collections.abc import Iterable
 from contextlib import contextmanager
 from enum import Enum
 from functools import lru_cache, partial
 from logging import Logger
 from multiprocessing import Pool
 from time import time
-from typing import Iterable, Optional
 
 import langdetect
 import numpy as np
@@ -208,29 +208,23 @@ def fit(self, dataset: DeboilerDataset, chunksize: int = 100) -> None:
         }
 
         self.logger.debug(
-            (
-                f"Number of shared elements that did not meet the occurrence threshold {self._domain_desc}: "
-                f"{len(boilerplate_elements_counter) - len(self.boilerplate_elements)}"
-            )
+            f"Number of shared elements that did not meet the occurrence threshold {self._domain_desc}: "
+            f"{len(boilerplate_elements_counter) - len(self.boilerplate_elements)}"
         )
         self.logger.debug(
-            (
-                f"Number of similar pairs that were excluded from boilerplate identification {self._domain_desc}: "
-                f"{n_similar_pairs:,}"
-            )
+            f"Number of similar pairs that were excluded from boilerplate identification {self._domain_desc}: "
+            f"{n_similar_pairs:,}"
         )
         self.logger.info(
-            (
-                f"Total number of boilerplate elements found for the domain {self._domain_desc}: "
-                f"{len(self.boilerplate_elements):,}"
-            )
+            f"Total number of boilerplate elements found for the domain {self._domain_desc}: "
+            f"{len(self.boilerplate_elements):,}"
         )
         self.logger.info(
             f"Boilerplate identification took {time() - start_time:,.1f} seconds {self._domain_desc}"
         )
 
     @classmethod
-    def detect_language(cls, page: LxmlTree, cleaned_text: str) -> Optional[str]:
+    def detect_language(cls, page: LxmlTree, cleaned_text: str) -> str | None:
         """
         First, tries to detect language based on page metadata.
         If that is not available, uses the heuristic detection algorithm from langdetect.
@@ -303,7 +297,10 @@ def _domain_desc(self):
         return f"({self.domain})" if self.domain else "for domain"
 
     def transform(
-        self, dataset: DeboilerDataset, chunksize: int = 100, include_cleaned_html: bool = False
+        self,
+        dataset: DeboilerDataset,
+        chunksize: int = 100,
+        include_cleaned_html: bool = False,
     ) -> Iterable[OutputPage]:
         """
         Transforms the input dataset and yields an OutputPage object for each page.
@@ -316,10 +313,8 @@ def transform(
                 assert len(dataset.cached_pages) == len(dataset)
             except AssertionError:
                 raise AssertionError(
-                    (
-                        "In `performance` mode, the same dataset passed to the `fit` method should be "
-                        "passed to the `transform` method"
-                    )
+                    "In `performance` mode, the same dataset passed to the `fit` method should be "
+                    "passed to the `transform` method"
                 )
 
         start_time = time()
@@ -345,8 +340,6 @@ def transform(
         self.logger.info(f" * Number of pages: {len(page_len_delats):,}")
         self.logger.info(f" * Time taken: {time() - start_time:,.1f} seconds")
         self.logger.info(
-            (
-                " * Noise reduction per page (characters): "
-                f"{np.mean(page_len_delats):.1f} mean, {np.median(page_len_delats):.1f} median"
-            )
+            " * Noise reduction per page (characters): "
+            f"{np.mean(page_len_delats):.1f} mean, {np.median(page_len_delats):.1f} median"
         )
diff --git a/deboiler/models/lxml_node.py b/deboiler/models/lxml_node.py
index fa44e5f..627dbac 100644
--- a/deboiler/models/lxml_node.py
+++ b/deboiler/models/lxml_node.py
@@ -1,13 +1,9 @@
 import cgi
 import re
 import unicodedata
+from collections.abc import Generator, Mapping
 from logging import debug
-from typing import (
-    Generator,
-    Mapping,
-    Optional,
-    Union,
-)
+from typing import Optional
 
 from lxml.etree import _Comment, _Element, _ElementTree
 from lxml.html import tostring as html_tostring
@@ -129,11 +125,11 @@ class LxmlNode:
     title, text, headings, lists, breadcrumbs, etc.
     """
 
-    def __init__(self, node: Union[_ElementTree, _Element], tree: LxmlTree):
+    def __init__(self, node: _ElementTree | _Element, tree: LxmlTree):
         self.node = node
         self.tree = tree  # page tree
 
-        self._normalized_representation_cache: Optional[str] = None
+        self._normalized_representation_cache: str | None = None
 
     def __iter__(self):
         for child in self.node:
@@ -173,14 +169,7 @@ def clear_cache(self):
         self._normalized_representation_cache = None
 
     def _remove_spaces(self, text: str) -> str:
-        return (
-            text
-            .replace("\t", "")
-            .replace("\n", "")
-            .replace(" ", "")
-            .lower()
-            .strip()
-        )
+        return text.replace("\t", "").replace("\n", "").replace(" ", "").lower().strip()
 
     def normalized_representation(self, is_root: bool = True) -> str:
         """
@@ -229,7 +218,7 @@ def _normalized_representation(self) -> str:
         return f"<{self.tag}{attribute_string}>{text}{optional_text_space}{internal_elements}"
 
     @staticmethod
-    def _normalize_string(text: Optional[str], lower_case: bool = False) -> str:
+    def _normalize_string(text: str | None, lower_case: bool = False) -> str:
         """
         Normalizes an input string by removing extra spaces, tabs, multiple new lines, etc.
         """
diff --git a/deboiler/models/page.py b/deboiler/models/page.py
index 2f83cde..ab13f45 100644
--- a/deboiler/models/page.py
+++ b/deboiler/models/page.py
@@ -2,7 +2,6 @@
 from dataclasses import dataclass
 from io import StringIO
 from logging import Logger
-from typing import Optional, Union
 
 from lxml.etree import HTMLParser, _Element, parse as parse_html
 
@@ -21,7 +20,7 @@ class RawPage:
     """
 
     url: str
-    content: Union[bytes, str]
+    content: bytes | str
 
     def __repr__(self):
         return f"RawPage(url={self.url}, content={self.content[:20]}...)"
@@ -43,7 +42,7 @@ class ParsedPage:
     logger: Logger
     parser = HTMLParser(remove_comments=True)
 
-    def __init__(self, url: str, content: Union[bytes, str]):
+    def __init__(self, url: str, content: bytes | str):
         self.url = url
         self.content: LxmlTree = self.parse(content)
         self.nodes: set[str] = {
@@ -55,7 +54,7 @@ def __init__(self, url: str, content: Union[bytes, str]):
     def __repr__(self):
         return f"ParsedPage(url={self.url})"
 
-    def parse(self, content: Union[str, bytes]) -> LxmlTree:
+    def parse(self, content: str | bytes) -> LxmlTree:
         """
         Parses the input html string/bytes into an LxmlTree.
         """
@@ -147,5 +146,5 @@ class OutputPage:
     headings: str
     lists: str
     breadcrumbs: str
-    language: Optional[str] = None
-    cleaned_html: Optional[str] = None
+    language: str | None = None
+    cleaned_html: str | None = None
diff --git a/deboiler/tests/fixtures/__init__.py b/deboiler/tests/fixtures/__init__.py
index a97e7d9..744f964 100644
--- a/deboiler/tests/fixtures/__init__.py
+++ b/deboiler/tests/fixtures/__init__.py
@@ -1,9 +1,9 @@
-from pkg_resources import resource_filename
+from importlib import resources
 from pathlib import Path
 
 
 def get_fixture_path(fixture_name: str = "") -> Path:
-    return Path(
-        resource_filename(__name__, fixture_name),
-    )
+    ref = resources.files(__name__) / fixture_name
+    with resources.as_file(ref) as path:
+        return path
diff --git a/deboiler/tests/test_language_detection.py b/deboiler/tests/test_language_detection.py
index 9fead1e..9c1b800 100644
--- a/deboiler/tests/test_language_detection.py
+++ b/deboiler/tests/test_language_detection.py
@@ -32,19 +32,16 @@
         template.substitute(language='lang="en"', title="page title", text=TEXT),
         "en",
     ),
-    # english with locale from meta
     (
         template.substitute(language='lang="en-us"', title="page title", text=TEXT),
         "en-us",
     ),
-    # english from text
     (
         template.substitute(language="", title="page title", text=TEXT),
         "en",
     ),
-    # non-english from text
     (
         template.substitute(language="", title="título de la página", text="algún texto"),
diff --git a/setup.cfg b/setup.cfg
index 0a7be44..c1f1701 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,13 +20,6 @@ skip = __init__.py
 [mypy]
 ignore_missing_imports = True
 
-[nosetests]
-with-coverage = True
-cover-package = deboiler
-cover-html = True
-cover-html-dir = coverage
-cover-erase = True
-
 [tool:pytest]
 addopts = --cov deboiler
diff --git a/sonar-project.properties b/sonar-project.properties
index d17a290..f15f781 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -8,7 +8,7 @@ sonar.projectKey=globality-corp_deboiler
 sonar.projectName=deboiler
 sonar.github.repository=globality-corp/deboiler
 sonar.host.url=https://sonarqube.globality.cloud
-sonar.python.version=3.9,3.10
+sonar.python.version=3.10
 sonar.python.coverage.reportPaths=deboiler/tests/coverage/cov.xml,deboiler/tests/coverage/backwards-compat.xml,deboiler/tests/coverage/daemon-contract.xml
 sonar.python.xunit.reportPath=deboiler/tests/test-results/pytest/junit.xml
 sonar.exclusions=deboiler/migrations/*