Skip to content

Commit

Permalink
Ruff fixes (#40)
Browse files (browse the repository at this point in the history)
* Reformatted codebase with new ruff preview
* Ruff fixes for utils
* ruff: more fixes
* ruff: remaining utils issues
* Fix Path usages in utils
* 2 more ruff ignores
  • Loading branch information
ubmarco authored Feb 22, 2024
1 parent 29fe5a3 commit 518ead3
Show file tree
Hide file tree
Showing 32 changed files with 271 additions and 185 deletions.
2 changes: 1 addition & 1 deletion libpdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from libpdf.core import main_cli

# define importable objects
__all__ = ["load", "__version__", "__summary__"]
__all__ = ["__summary__", "__version__", "load"]

# Enable running
# python -m libpdf.__init__
Expand Down
1 change: 1 addition & 0 deletions libpdf/_import_forks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
These 2 methods take time, so below solution is a short-term workaround.
"""

import os
import sys

Expand Down
19 changes: 9 additions & 10 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""PDF catalog extraction."""

import logging
import re
from typing import Any, Dict, List, Union
Expand Down Expand Up @@ -211,14 +212,12 @@ def chapter_number_giver(

if chapter_number:
# The assumption is that only one match is found
chapters_in_outline[idx_chapter].update({"number": chapter_number[0]})
chapters_in_outline[idx_chapter].update(
{"title": chapter_title.replace(chapter_number[0], "", 1).strip()}
)
chapter.update({"number": chapter_number[0]})
chapter.update({
"title": chapter_title.replace(chapter_number[0], "", 1).strip()
})
else:
chapters_in_outline[idx_chapter].update(
{"number": f"virt.{new_hierarchical_level}"}
)
chapter.update({"number": f"virt.{new_hierarchical_level}"})

if chapter["content"]:
# next deeper level
Expand Down Expand Up @@ -653,9 +652,9 @@ def _resolve_pdf_obj_refs(
)
resolved_dict[key] = ret_list
else:
resolved_dict[
key
] = resolved # add resolved element to dictionary
resolved_dict[key] = (
resolved # add resolved element to dictionary
)
else:
# leave other types as they are
resolved_dict[key] = value
Expand Down
4 changes: 2 additions & 2 deletions libpdf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# not importing load(), so no circular import when importing from root __init__.py
from libpdf import __summary__, __version__, parameters # pylint: disable=cyclic-import
from libpdf.apiobjects import ApiObjects
from libpdf.extract import LibpdfException, extract
from libpdf.extract import LibpdfError, extract
from libpdf.log import config_logger, get_level_name, set_log_level
from libpdf.parameters import RENDER_ELEMENTS
from libpdf.process import output_dump
Expand Down Expand Up @@ -139,7 +139,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
no_rects,
overall_pbar,
)
except LibpdfException:
except LibpdfError:
if cli_usage:
LOG.critical("Exiting with code 1")
sys.exit(1)
Expand Down
9 changes: 8 additions & 1 deletion libpdf/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
"""Libpdf exceptions."""


class LibpdfException(Exception):
class LibpdfError(Exception):
"""Generic libpdf exception class."""


class TextContainsNewlineError(ValueError):
    """Text cannot contain newline character.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.

    :ivar text: the offending input text, kept for programmatic inspection
    :vartype text: str
    """

    def __init__(self, text: str) -> None:
        """Build the error message from the offending text.

        :param text: the input text that contains a newline character; it is
            embedded verbatim in the exception message
        """
        super().__init__(f'Input text "{text}" contains a new line character.')
        # Keep the raw value so handlers do not have to parse the message.
        self.text = text
27 changes: 15 additions & 12 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Core routines for PDF extraction."""

import itertools
import logging
import os
Expand All @@ -14,7 +15,7 @@
from libpdf import process as pro
from libpdf.apiobjects import ApiObjects
from libpdf.catalog import catalog, extract_catalog
from libpdf.exceptions import LibpdfException
from libpdf.exceptions import LibpdfError
from libpdf.log import logging_needed
from libpdf.models.figure import Figure
from libpdf.models.file import File
Expand Down Expand Up @@ -86,7 +87,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
:param no_rects: flag triggering the exclusion of rects
:param overall_pbar: total progress bar for whole libpdf run
:return: instance of Objects class
:raise LibpdfException: PDF contains no pages
:raise LibpdfError: PDF contains no pages
"""
LOG.info("PDF extraction started ...")

Expand Down Expand Up @@ -116,7 +117,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
if len(pdf.pages) == 0:
message = "Page range selection: no pages left in the PDF to analyze."
LOG.critical(message)
raise LibpdfException(message)
raise LibpdfError(message)

overall_pbar.update(5)
pdf = delete_page_ann(pdf)
Expand All @@ -131,7 +132,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
pages_list = extract_page_metadata(pdf)

if not pages_list:
raise LibpdfException("PDF contains no pages")
raise LibpdfError("PDF contains no pages")

overall_pbar.update(1)

Expand Down Expand Up @@ -530,15 +531,15 @@ def _get_datetime_format(date: str):
if "CreationDate" in pdf.metadata:
preprocessed_date = _time_preprocess(pdf.metadata["CreationDate"])
time_format = _get_datetime_format(preprocessed_date)
file_meta_params.update(
{"creation_date": datetime.strptime(preprocessed_date, time_format)}
)
file_meta_params.update({
"creation_date": datetime.strptime(preprocessed_date, time_format)
})
if "ModDate" in pdf.metadata:
preprocessed_date = _time_preprocess(pdf.metadata["ModDate"])
time_format = _get_datetime_format(preprocessed_date)
file_meta_params.update(
{"modified_date": datetime.strptime(preprocessed_date, time_format)}
)
file_meta_params.update({
"modified_date": datetime.strptime(preprocessed_date, time_format)
})
if "Trapped" in pdf.metadata:
file_meta_params.update({"trapped": pdf.metadata["Trapped"]})

Expand Down Expand Up @@ -705,7 +706,7 @@ def extract_rects(
)

LOG.info(
f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}"
f"found rect at {rect_bbox} at page {idx_page + 1}: color {non_stroking_color}"
)
lt_textbox = lt_textbox_crop(
rect_bbox,
Expand All @@ -722,7 +723,9 @@ def extract_rects(
rect_list.append(rect)

else:
LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}")
LOG.info(
f"found no rects on page {idx_page + 1}: {page_crop.objects.keys()}"
)

# return figure_list
return rect_list
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/element.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF elements."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

Expand Down
1 change: 1 addition & 0 deletions libpdf/models/figure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF figures."""

from typing import TYPE_CHECKING, List

from libpdf.models.element import Element
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/file.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF file."""

from typing import TYPE_CHECKING

from libpdf.models.file_meta import FileMeta
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/file_meta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF file meta data."""

from datetime import datetime
from typing import TYPE_CHECKING

Expand Down
66 changes: 34 additions & 32 deletions libpdf/models/horizontal_box.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Definition of HorizontalBox to contain text in the PDF."""

from typing import List
from __future__ import annotations


class Char: # pylint: disable=too-few-public-methods # simplicity is good.
Expand All @@ -11,30 +11,32 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good.
:vartype text: str
:ivar x0: distance from the left of the page to the left edge of the character
:vartype x0: float
:ivar y0: distance from the bottom of the page to the lower edge of the character (less than y1)
:ivar y0: distance from the bottom of the page to the lower edge of the character
(less than y1)
:vartype y0: float
:ivar x1: distance from the left of the page to the right edge of the character
:vartype x1: float
:ivar y1: distance from the bottom of the page to the upper edge of the character (greater than y0)
:ivar y1: distance from the bottom of the page to the upper edge of the character
(greater than y0)
:vartype y1: float
"""

def __init__(
self,
text: str,
x0: float = None,
y0: float = None,
x1: float = None,
y1: float = None,
x0: float | None = None,
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
):
"""Init the class with plain char of a character and its rectangular coordinates."""
"""Init with plain char of a character and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.text = text

def __repr__(self):
def __repr__(self) -> str:
"""Make the text part of the repr for better debugging."""
return f"{type(self).__name__}({self.text})"

Expand All @@ -51,11 +53,11 @@ class Word:

def __init__(
self,
chars: List[Char],
x0: float = None,
y0: float = None,
x1: float = None,
y1: float = None,
chars: list[Char],
x0: float | None = None,
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
):
"""Init the class with plain text of a word and its rectangular coordinates."""
self.x0 = x0
Expand All @@ -71,11 +73,11 @@ def __init__(
self.y1 = max(text_obj.y1 for text_obj in self.chars)

@property
def text(self):
def text(self) -> str:
"""Return plain text."""
return "".join([x.text for x in self.chars])

def __repr__(self):
def __repr__(self) -> str:
"""Make the text part of the repr for better debugging."""
return f"{type(self).__name__}({self.text})"

Expand All @@ -92,13 +94,13 @@ class HorizontalLine:

def __init__(
self,
words: List[Word],
x0: float = None,
y0: float = None,
x1: float = None,
y1: float = None,
words: list[Word],
x0: float | None = None,
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
):
"""Init the class with plain text of a horizontal line and its rectangular coordinates."""
"""Init with plain text of a horizontal line and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
Expand All @@ -112,11 +114,11 @@ def __init__(
self.y1 = max(text_obj.y1 for text_obj in self.words)

@property
def text(self):
def text(self) -> str:
"""Return plain text."""
return " ".join([x.text for x in self.words])

def __repr__(self):
def __repr__(self) -> str:
"""Make the text part of the repr for better debugging."""
return f"{type(self).__name__}({self.text})"

Expand All @@ -133,13 +135,13 @@ class HorizontalBox:

def __init__(
self,
lines: List[HorizontalLine],
x0: float = None,
y0: float = None,
x1: float = None,
y1: float = None,
lines: list[HorizontalLine],
x0: float | None = None,
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
):
"""Init the class with plain text of a horizontal box and its rectangular coordinates."""
"""Init with plain text of a horizontal box and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
Expand All @@ -153,11 +155,11 @@ def __init__(
self.y1 = max(text_obj.y1 for text_obj in self.lines)

@property
def text(self):
def text(self) -> str:
"""Return plain text."""
return "\n".join([x.text for x in self.lines])

def __repr__(self):
def __repr__(self) -> str | None:
"""Make the text part of the repr for better debugging."""
if self.lines:
return f"{type(self).__name__}({self.text})"
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/link.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF linked text."""

from typing import TYPE_CHECKING, Dict, Union

from libpdf.models.model_base import ModelBase
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/model_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Base class for all PDF model classes."""

import logging

LOG = logging.getLogger(__name__)
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF pages."""

from typing import TYPE_CHECKING, List, Union

from libpdf.models.model_base import ModelBase
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/paragraph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF textblocks."""

from typing import TYPE_CHECKING, List

from libpdf.models.element import Element
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/position.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition of positions in the PDF."""

from typing import TYPE_CHECKING

from libpdf.parameters import TARGET_COOR_TOLERANCE
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/rect.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF rects."""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/root.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF root element."""

from typing import List, Union

from libpdf.models.chapter import Chapter
Expand Down
1 change: 1 addition & 0 deletions libpdf/models/table.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Definition for PDF tables."""

from operator import attrgetter
from typing import List

Expand Down
Loading

0 comments on commit 518ead3

Please sign in to comment.