Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to Python 3.10 #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,25 @@ trigger:

steps:
- name: lint-and-type-check
image: python:3.9-slim
image: python:3.10-slim
environment:
AWS_ACCOUNT_ID:
from_secret: AWS_ACCOUNT_ID
commands:
- pip install -U pip==21.3.1
- pip install -U pip==23.3.2
- pip install awscli
- aws codeartifact login --tool pip --repository globality-pypi-local --domain globality --domain-owner $AWS_ACCOUNT_ID --region us-east-1
- ./entrypoint.sh lint
- ./entrypoint.sh typehinting


- name: test-py39-latest
image: python:3.9-slim
- name: test-py310-latest
image: python:3.10-slim
environment:
AWS_ACCOUNT_ID:
from_secret: AWS_ACCOUNT_ID
commands:
- pip install -U pip==21.3.1
- pip install -U pip==23.3.2
- pip install awscli
- aws codeartifact login --tool pip --repository globality-pypi-local --domain globality --domain-owner $AWS_ACCOUNT_ID --region us-east-1
- ./entrypoint.sh test
Expand All @@ -51,7 +51,7 @@ steps:
-Dsonar.branch.name=${DRONE_BRANCH}
depends_on:
- lint-and-type-check
- test-py39-latest
- test-py310-latest

- name: release-python-library-codeartifact
image: python:3.10-slim
Expand All @@ -65,7 +65,7 @@ steps:
depends_on:
- sonar-scanner
commands:
- pip install -U pip==23.2.1
- pip install -U pip==23.3.2
- pip install --quiet awscli twine==4.0.2 packaging==23.1 bumpversion
- bumpversion minor --allow-dirty --new-version `/bin/date +%Y.%-V`.${DRONE_BUILD_NUMBER}
- export version=$(cat .bumpversion.cfg | awk '/current_version / {print $3}')
Expand Down Expand Up @@ -108,7 +108,7 @@ trigger:
steps:
- name: dependency-validation-dummy
pull: always
image: python:3.9-slim
image: python:3.10-slim
commands:
- echo "Dummy step to trigger dependency-validation"

Expand Down
4 changes: 2 additions & 2 deletions .globality/build.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"core_packages": "locales libicu-dev libgomp1 build-essential wget libffi-dev",
"dependency_tool": "pip-tools",
"docker": {
"docker_tag": "python:3.9-slim"
"docker_tag": "python:3.10-slim"
},
"entrypoint": {
"pre_typehinting_commands": [
Expand Down Expand Up @@ -33,7 +33,7 @@
],
"repository": "pypi"
},
"sonar_python_versions": "3.9,3.10",
"sonar_python_versions": "3.10",
"test_command": "pytest",
"test_py37": false,
"use_globality_black": true
Expand Down
5 changes: 4 additions & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions deboiler/dataset/avro_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from logging import Logger
from pathlib import Path
from typing import Optional

from fastavro import reader

Expand All @@ -21,8 +20,8 @@ def __init__(
self,
file_path: str,
content_key: str = "content",
status_key: Optional[str] = "status",
content_type_key: Optional[str] = "content_type",
status_key: str | None = "status",
content_type_key: str | None = "content_type",
verbose: bool = True,
):
super().__init__(verbose=verbose)
Expand Down
8 changes: 4 additions & 4 deletions deboiler/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod, abstractproperty
from typing import Mapping, Optional
from collections.abc import Mapping

from tqdm import tqdm

Expand All @@ -19,9 +19,9 @@ class DeboilerDataset(ABC):

def __init__(
self,
content_key: Optional[str] = "content",
status_key: Optional[str] = "status",
content_type_key: Optional[str] = "content_type",
content_key: str | None = "content",
status_key: str | None = "status",
content_type_key: str | None = "content_type",
verbose: bool = True,
):
self.cached_pages: Mapping[str, ParsedPage] = dict()
Expand Down
5 changes: 2 additions & 3 deletions deboiler/dataset/dataframe_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from logging import Logger
from typing import Optional

import pandas as pd

Expand All @@ -20,8 +19,8 @@ def __init__(
self,
records: pd.DataFrame,
content_key: str = "content",
status_key: Optional[str] = "status",
content_type_key: Optional[str] = "content_type",
status_key: str | None = "status",
content_type_key: str | None = "content_type",
verbose: bool = True,
):
super().__init__(
Expand Down
7 changes: 3 additions & 4 deletions deboiler/dataset/json_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
from logging import Logger
from pathlib import Path
from typing import Optional, Union

from deboiler.dataset.base import DeboilerDataset
from deboiler.logger import logger
Expand All @@ -18,10 +17,10 @@ class JsonDataset(DeboilerDataset):

def __init__(
self,
file_path: Union[str, Path],
file_path: str | Path,
content_key: str = "content",
status_key: Optional[str] = "status",
content_type_key: Optional[str] = "content_type",
status_key: str | None = "status",
content_type_key: str | None = "content_type",
verbose: bool = True,
):
super().__init__(
Expand Down
5 changes: 2 additions & 3 deletions deboiler/dataset/list_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from logging import Logger
from typing import Optional

from deboiler.dataset.base import DeboilerDataset
from deboiler.logger import logger
Expand All @@ -18,8 +17,8 @@ def __init__(
self,
records: list[dict],
content_key: str = "content",
status_key: Optional[str] = "status",
content_type_key: Optional[str] = "content_type",
status_key: str | None = "status",
content_type_key: str | None = "content_type",
verbose: bool = True,
):
super().__init__(
Expand Down
39 changes: 16 additions & 23 deletions deboiler/deboiler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from collections import defaultdict
from collections.abc import Iterable
from contextlib import contextmanager
from enum import Enum
from functools import lru_cache, partial
from logging import Logger
from multiprocessing import Pool
from time import time
from typing import Iterable, Optional

import langdetect
import numpy as np
Expand Down Expand Up @@ -208,29 +208,23 @@ def fit(self, dataset: DeboilerDataset, chunksize: int = 100) -> None:
}

self.logger.debug(
(
f"Number of shared elements that did not meet the occurrence threshold {self._domain_desc}: "
f"{len(boilerplate_elements_counter) - len(self.boilerplate_elements)}"
)
f"Number of shared elements that did not meet the occurrence threshold {self._domain_desc}: "
f"{len(boilerplate_elements_counter) - len(self.boilerplate_elements)}"
)
self.logger.debug(
(
f"Number of similar pairs that were excluded from boilerplate identification {self._domain_desc}: "
f"{n_similar_pairs:,}"
)
f"Number of similar pairs that were excluded from boilerplate identification {self._domain_desc}: "
f"{n_similar_pairs:,}"
)
self.logger.info(
(
f"Total number of boilerplate elements found for the domain {self._domain_desc}: "
f"{len(self.boilerplate_elements):,}"
)
f"Total number of boilerplate elements found for the domain {self._domain_desc}: "
f"{len(self.boilerplate_elements):,}"
)
self.logger.info(
f"Boilerplate identification took {time() - start_time:,.1f} seconds {self._domain_desc}"
)

@classmethod
def detect_language(cls, page: LxmlTree, cleaned_text: str) -> Optional[str]:
def detect_language(cls, page: LxmlTree, cleaned_text: str) -> str | None:
"""
First, tries to detect language based on page metadata.
If that is not available, uses the heuristic detection algorithm from langdetect.
Expand Down Expand Up @@ -303,7 +297,10 @@ def _domain_desc(self):
return f"({self.domain})" if self.domain else "for domain"

def transform(
self, dataset: DeboilerDataset, chunksize: int = 100, include_cleaned_html: bool = False
self,
dataset: DeboilerDataset,
chunksize: int = 100,
include_cleaned_html: bool = False,
) -> Iterable[OutputPage]:
"""
Transforms the input dataset and yields an OutputPage object for each page.
Expand All @@ -316,10 +313,8 @@ def transform(
assert len(dataset.cached_pages) == len(dataset)
except AssertionError:
raise AssertionError(
(
"In `performance` mode, the same dataset passed to the `fit` method should be "
"passed to the `transform` method"
)
"In `performance` mode, the same dataset passed to the `fit` method should be "
"passed to the `transform` method"
)

start_time = time()
Expand All @@ -345,8 +340,6 @@ def transform(
self.logger.info(f" * Number of pages: {len(page_len_delats):,}")
self.logger.info(f" * Time taken: {time() - start_time:,.1f} seconds")
self.logger.info(
(
" * Noise reduction per page (characters): "
f"{np.mean(page_len_delats):.1f} mean, {np.median(page_len_delats):.1f} median"
)
" * Noise reduction per page (characters): "
f"{np.mean(page_len_delats):.1f} mean, {np.median(page_len_delats):.1f} median"
)
23 changes: 6 additions & 17 deletions deboiler/models/lxml_node.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
import cgi
import re
import unicodedata
from collections.abc import Generator, Mapping
from logging import debug
from typing import (
Generator,
Mapping,
Optional,
Union,
)
from typing import Optional

from lxml.etree import _Comment, _Element, _ElementTree
from lxml.html import tostring as html_tostring
Expand Down Expand Up @@ -129,11 +125,11 @@ class LxmlNode:
title, text, headings, lists, breadcrumbs, etc.
"""

def __init__(self, node: Union[_ElementTree, _Element], tree: LxmlTree):
def __init__(self, node: _ElementTree | _Element, tree: LxmlTree):
self.node = node
self.tree = tree # page tree

self._normalized_representation_cache: Optional[str] = None
self._normalized_representation_cache: str | None = None

def __iter__(self):
for child in self.node:
Expand Down Expand Up @@ -173,14 +169,7 @@ def clear_cache(self):
self._normalized_representation_cache = None

def _remove_spaces(self, text: str) -> str:
return (
text
.replace("\t", "")
.replace("\n", "")
.replace(" ", "")
.lower()
.strip()
)
return text.replace("\t", "").replace("\n", "").replace(" ", "").lower().strip()

def normalized_representation(self, is_root: bool = True) -> str:
"""
Expand Down Expand Up @@ -229,7 +218,7 @@ def _normalized_representation(self) -> str:
return f"<{self.tag}{attribute_string}>{text}{optional_text_space}{internal_elements}</{self.tag}>"

@staticmethod
def _normalize_string(text: Optional[str], lower_case: bool = False) -> str:
def _normalize_string(text: str | None, lower_case: bool = False) -> str:
"""
Normalizes an input string by removing extra spaces, tabs, multiple new lines, etc.
"""
Expand Down
11 changes: 5 additions & 6 deletions deboiler/models/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from dataclasses import dataclass
from io import StringIO
from logging import Logger
from typing import Optional, Union

from lxml.etree import HTMLParser, _Element, parse as parse_html

Expand All @@ -21,7 +20,7 @@ class RawPage:
"""

url: str
content: Union[bytes, str]
content: bytes | str

def __repr__(self):
return f"RawPage(url={self.url}, content={self.content[:20]}...)"
Expand All @@ -43,7 +42,7 @@ class ParsedPage:
logger: Logger
parser = HTMLParser(remove_comments=True)

def __init__(self, url: str, content: Union[bytes, str]):
def __init__(self, url: str, content: bytes | str):
self.url = url
self.content: LxmlTree = self.parse(content)
self.nodes: set[str] = {
Expand All @@ -55,7 +54,7 @@ def __init__(self, url: str, content: Union[bytes, str]):
def __repr__(self):
return f"ParsedPage(url={self.url})"

def parse(self, content: Union[str, bytes]) -> LxmlTree:
def parse(self, content: str | bytes) -> LxmlTree:
"""
Parses the input html string/bytes into an LxmlTree.
"""
Expand Down Expand Up @@ -147,5 +146,5 @@ class OutputPage:
headings: str
lists: str
breadcrumbs: str
language: Optional[str] = None
cleaned_html: Optional[str] = None
language: str | None = None
cleaned_html: str | None = None
8 changes: 4 additions & 4 deletions deboiler/tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pkg_resources import resource_filename
from importlib import resources

from pathlib import Path


def get_fixture_path(fixture_name: str = "") -> Path:
    """
    Returns a filesystem path for a fixture that ships inside this package.

    The resource is located with `importlib.resources` (the replacement for
    `pkg_resources.resource_filename`).

    :param fixture_name: name of the fixture, relative to this package.
    :return: a `Path` to the fixture that remains usable until the interpreter exits.

    """
    # Function-scope imports keep this block self-contained.
    import atexit
    from contextlib import ExitStack

    ref = resources.files(__name__) / fixture_name

    # `as_file` may extract the resource to a temporary location (e.g. when the package is
    # imported from a zip archive) and removes that copy when its context exits. Returning
    # from inside a `with` block would hand the caller a path whose backing file may already
    # be gone, so the context is kept open until interpreter shutdown instead.
    file_manager = ExitStack()
    atexit.register(file_manager.close)
    return file_manager.enter_context(resources.as_file(ref))
3 changes: 0 additions & 3 deletions deboiler/tests/test_language_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,16 @@
template.substitute(language='lang="en"', title="page title", text=TEXT),
"en",
),

# english with locale from meta
(
template.substitute(language='lang="en-us"', title="page title", text=TEXT),
"en-us",
),

# english from text
(
template.substitute(language="", title="page title", text=TEXT),
"en",
),

# non-english from text
(
template.substitute(language="", title="título de la página", text="algún texto"),
Expand Down
Loading