diff --git a/docs/contents/visual_debugging.rst b/docs/contents/visual_debugging.rst index 002c894..5c9cc5f 100644 --- a/docs/contents/visual_debugging.rst +++ b/docs/contents/visual_debugging.rst @@ -1,2 +1,10 @@ Visual debugging -================ \ No newline at end of file +================ + +Requirements +------------ + +The visual debugging feature requires Ghostscript to extract images from PDF files. +Ghostscript must be installed separately on your system. + +See `Ghostscript <https://www.ghostscript.com/>`_ for installation instructions. diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py index 9ca56cf..cc3a223 100644 --- a/libpdf/apiobjects.py +++ b/libpdf/apiobjects.py @@ -61,13 +61,12 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed if pdfminer is not None: # take argument first self.pdfminer = pdfminer + elif pdfplumber is not None: + # set from pdfplumber document + self.pdfminer = pdfplumber.doc else: - if pdfplumber is not None: - # set from pdfplumber document - self.pdfminer = pdfplumber.doc - else: - # nothing available - self.pdfminer = None + # nothing available + self.pdfminer = None class Flattened(NamedTuple): diff --git a/libpdf/catalog.py b/libpdf/catalog.py index fedc50b..6e940c4 100644 --- a/libpdf/catalog.py +++ b/libpdf/catalog.py @@ -124,7 +124,7 @@ def resolve_name_obj(name_tree_kids): """ temp_list = [] for kid in name_tree_kids: - if "Kids" in kid and kid["Kids"]: + if kid.get("Kids"): temp_list.extend([kid_kid.resolve() for kid_kid in kid["Kids"]]) elif "Names" in kid: return name_tree_kids @@ -311,14 +311,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl raise RuntimeError( f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object" ) + elif isinstance(outline_obj["Dest"], PSLiteral): + # PDF 1.1 name object + outline_dest = outline_obj["Dest"].name else: - # named destination - if isinstance(outline_obj["Dest"], PSLiteral): - # PDF 1.1 name object - outline_dest =
outline_obj["Dest"].name - else: - # PDF 1.2 byte string - outline_dest = outline_obj["Dest"].decode("utf-8") + # PDF 1.2 byte string + outline_dest = outline_obj["Dest"].decode("utf-8") title_bytes = outline_obj["Title"] else: raise ValueError("No key A and Dest in outline.") diff --git a/libpdf/extract.py b/libpdf/extract.py index 311710d..d2f65ba 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -429,9 +429,8 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t # recursively check again, to find the next min_low_pos, which will determine the header/footer boundary if elements_list: return check_false_positive_header_footer(pdf, elements_list) - else: - if len(elements_list) == 1: - elements_list.pop() + elif len(elements_list) == 1: + elements_list.pop() else: for idx, element in enumerate(elements_list): if float(f"{element.position.y0:.4f}") == header_low_pos: @@ -853,12 +852,9 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches "remove filtered figure fig0 due to partially overlap" ) filtered_figures.remove(fig0) - else: - if fig1 in filtered_figures: - LOG.debug( - "remove filtered figure fig1 due to partially overlap" - ) - filtered_figures.remove(fig1) + elif fig1 in filtered_figures: + LOG.debug("remove filtered figure fig1 due to partially overlap") + filtered_figures.remove(fig1) if len(filtered_figures) < len(figures_list): LOG.debug( diff --git a/libpdf/log.py b/libpdf/log.py index ca1c9f4..7239cc5 100644 --- a/libpdf/log.py +++ b/libpdf/log.py @@ -71,14 +71,13 @@ def config_logger(cli=True): init_tqdm = True else: init_basic = True - else: # API usage - if TQDM_AVAILABLE: - # this needs to be documented so any API user is not surprised that the libpdf logger has an attached - # handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be - # used or the user wants to define something else - init_tqdm = True - else: # don't init anything, it's 
up to the user - pass + elif TQDM_AVAILABLE: + # this needs to be documented so any API user is not surprised that the libpdf logger has an attached + # handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be + # used or the user wants to define something else + init_tqdm = True + else: # don't init anything, it's up to the user + pass log_format = "[%(levelname)5s] %(name)s - %(message)s" if init_tqdm: diff --git a/libpdf/models/model_base.py b/libpdf/models/model_base.py index 3f99327..a54dd9b 100644 --- a/libpdf/models/model_base.py +++ b/libpdf/models/model_base.py @@ -18,9 +18,8 @@ def to_dict(self): for key, value in vars_dict.items(): if key.startswith("b_"): delete_backref_keys.append(key) - else: - if isinstance(value, ModelBase): - vars_dict[key] = value.to_dict() + elif isinstance(value, ModelBase): + vars_dict[key] = value.to_dict() # delete back references for key in delete_backref_keys: del vars_dict[key] diff --git a/libpdf/process.py b/libpdf/process.py index 6500aec..f723c77 100644 --- a/libpdf/process.py +++ b/libpdf/process.py @@ -309,21 +309,20 @@ def fill_elements_content( id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1} content = elements_in_outline[index_element].content index_b_chapter = index_element + elif "content" in locals(): + element.idx = id_dict[element.type] + element.b_chapter = elements_in_outline[index_b_chapter] + content.append(element) + id_dict[element.type] += 1 else: - if "content" in locals(): - element.idx = id_dict[element.type] - element.b_chapter = elements_in_outline[index_b_chapter] - content.append(element) - id_dict[element.type] += 1 - else: - # TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?) - # 2. the message is unclear - # 3. if it's a programming error, fix the code - # 4. 
if it's a real runtime issue coming from wrong PDF input, catch the error one level above - # and log an understandable, critical error - raise ValueError( - "elements can not fill into the content because it does not exist" - ) + # TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?) + # 2. the message is unclear + # 3. if it's a programming error, fix the code + # 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above + # and log an understandable, critical error + raise ValueError( + "elements can not fill into the content because it does not exist" + ) chapters_content = list( filter(lambda x: isinstance(x, Chapter), elements_in_outline) @@ -435,20 +434,19 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al for link in element.links: target_id = find_target_id(link, pages_list, element) link.libpdf_target = target_id + elif isinstance(element, Cell): + # Cell is not considered as element + pass else: - if isinstance(element, Cell): - # Cell is not considered as element - pass - else: - # TODO reason about the overall logic; which cases can be removed? distinguish between - # programming errors (raise RuntimeErrors) and cases that actually may exist in the - # wild and write human-readable log messages (e.g. - # The link on page xy with text xy cannot be resolved to a libpdf element; linking - # to the target page position instead - LOG.error( - "The source link in the paragraph %s is missing", - repr(element), - ) + # TODO reason about the overall logic; which cases can be removed? distinguish between + # programming errors (raise RuntimeErrors) and cases that actually may exist in the + # wild and write human-readable log messages (e.g. 
+ # The link on page xy with text xy cannot be resolved to a libpdf element; linking + # to the target page position instead + LOG.error( + "The source link in the paragraph %s is missing", + repr(element), + ) def elements_with_anno_finder( diff --git a/libpdf/textbox.py b/libpdf/textbox.py index e4db528..9794826 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -79,10 +79,9 @@ def extract_paragraphs_chapters( chapter_list = [] if no_chapters: LOG.info("Excluding chapters extraction") - else: - if catalog["outline"]: - LOG.info("Extracting chapters ...") - chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf) + elif catalog["outline"]: + LOG.info("Extracting chapters ...") + chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf) paragraph_list = [] if no_paragraphs: @@ -782,17 +781,15 @@ def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # bette # the incoming char is outside the anno-rectangle pass - else: - # the char is LTAnno - if idx_char == len(ltobjs_in_lttextline) - 1: - # the last char of the textline + elif idx_char == len(ltobjs_in_lttextline) - 1: + # the last char of the textline + anno_complete = True + elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar): + if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]: + # the next char is outside of the current anno-rectangle anno_complete = True - elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar): - if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]: - # the next char is outside of the current anno-rectangle - anno_complete = True - else: - raise ValueError("two LTAnno occurs in a row") + else: + raise ValueError("two LTAnno occurs in a row") return anno_complete diff --git a/libpdf/utils.py b/libpdf/utils.py index b593191..24ca914 100644 --- a/libpdf/utils.py +++ b/libpdf/utils.py @@ -303,43 +303,40 @@ def find_lt_obj_in_bbox( ): # This is the case when a LT object is neither inside nor intersected with the given 
bounding box. pass - else: - # This is the case when a LT object is intersected with the given box. In this case, the LT objects inside the - # given bounding box need to be hierarchically and recursively found. - if hasattr(lt_obj, "_objs"): - # All the downwards hierarchical LT objects are stored in the attribute "_objs". - # If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy. - text_inside_bbox = False # True on LTTextLine level when the first LTChar is inside the BBOX - for item in lt_obj._objs: # pylint: disable=protected-access - if isinstance(item, LTAnno): - # special treatment of LTAnno because it is virtual with no position data - if text_inside_bbox: - # LTAnno is added because an LTChar was inside the bbox before + elif hasattr(lt_obj, "_objs"): + # All the downwards hierarchical LT objects are stored in the attribute "_objs". + # If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy. + text_inside_bbox = ( + False # True on LTTextLine level when the first LTChar is inside the BBOX + ) + for item in lt_obj._objs: # pylint: disable=protected-access + if isinstance(item, LTAnno): + # special treatment of LTAnno because it is virtual with no position data + if text_inside_bbox: + # LTAnno is added because an LTChar was inside the bbox before + lt_objs_in_bbox.append(item) + elif isinstance(item, LTChar): + # check if the first and last LTChar have shown in the given bbox to decide if the trailing + # LTAnno should be added + ltchar_inside = check_lt_obj_in_bbox(item, bbox) + if text_inside_bbox: + if ltchar_inside: lt_objs_in_bbox.append(item) - else: - if isinstance(item, LTChar): - # check if the first and last LTChar have shown in the given bbox to decide if the trailing - # LTAnno should be added - ltchar_inside = check_lt_obj_in_bbox(item, bbox) - if text_inside_bbox: - if ltchar_inside: - lt_objs_in_bbox.append(item) - else: - # the bbox just ended and can't enter again - break - else: - if 
ltchar_inside: - lt_objs_in_bbox.append(item) - text_inside_bbox = True - else: - # no LTChar was added before, so not in BBOX yet - pass else: - # it is not an LTAnno nor an LTChar, so recurse and break it further down - find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox) - else: - # no attribute "_objs" exists. It reaches the bottom of the hierarchy - pass + # the bbox just ended and can't enter again + break + elif ltchar_inside: + lt_objs_in_bbox.append(item) + text_inside_bbox = True + else: + # no LTChar was added before, so not in BBOX yet + pass + else: + # it is not an LTAnno nor an LTChar, so recurse and break it further down + find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox) + else: + # no attribute "_objs" exists. It reaches the bottom of the hierarchy + pass def lt_page_crop( diff --git a/poetry.lock b/poetry.lock index 6354f62..8cce306 100644 --- a/poetry.lock +++ b/poetry.lock @@ -874,28 +874,28 @@ files = [ [[package]] name = "ruff" -version = "0.1.13" +version = "0.2.0" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.13-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:e3fd36e0d48aeac672aa850045e784673449ce619afc12823ea7868fcc41d8ba"}, - {file = "ruff-0.1.13-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9fb6b3b86450d4ec6a6732f9f60c4406061b6851c4b29f944f8c9d91c3611c7a"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b13ba5d7156daaf3fd08b6b993360a96060500aca7e307d95ecbc5bb47a69296"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9ebb40442f7b531e136d334ef0851412410061e65d61ca8ce90d894a094feb22"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226b517f42d59a543d6383cfe03cccf0091e3e0ed1b856c6824be03d2a75d3b6"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5f0312ba1061e9b8c724e9a702d3c8621e3c6e6c2c9bd862550ab2951ac75c16"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2f59bcf5217c661254bd6bc42d65a6fd1a8b80c48763cb5c2293295babd945dd"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6894b00495e00c27b6ba61af1fc666f17de6140345e5ef27dd6e08fb987259d"}, - {file = "ruff-0.1.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a1600942485c6e66119da294c6294856b5c86fd6df591ce293e4a4cc8e72989"}, - {file = "ruff-0.1.13-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ee3febce7863e231a467f90e681d3d89210b900d49ce88723ce052c8761be8c7"}, - {file = "ruff-0.1.13-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dcaab50e278ff497ee4d1fe69b29ca0a9a47cd954bb17963628fa417933c6eb1"}, - {file = "ruff-0.1.13-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f57de973de4edef3ad3044d6a50c02ad9fc2dff0d88587f25f1a48e3f72edf5e"}, - {file = 
"ruff-0.1.13-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7a36fa90eb12208272a858475ec43ac811ac37e91ef868759770b71bdabe27b6"}, - {file = "ruff-0.1.13-py3-none-win32.whl", hash = "sha256:a623349a505ff768dad6bd57087e2461be8db58305ebd5577bd0e98631f9ae69"}, - {file = "ruff-0.1.13-py3-none-win_amd64.whl", hash = "sha256:f988746e3c3982bea7f824c8fa318ce7f538c4dfefec99cd09c8770bd33e6539"}, - {file = "ruff-0.1.13-py3-none-win_arm64.whl", hash = "sha256:6bbbc3042075871ec17f28864808540a26f0f79a4478c357d3e3d2284e832998"}, - {file = "ruff-0.1.13.tar.gz", hash = "sha256:e261f1baed6291f434ffb1d5c6bd8051d1c2a26958072d38dfbec39b3dda7352"}, + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:638ea3294f800d18bae84a492cb5a245c8d29c90d19a91d8e338937a4c27fca0"}, + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3ff35433fcf4dff6d610738712152df6b7d92351a1bde8e00bd405b08b3d5759"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9faafbdcf4f53917019f2c230766da437d4fd5caecd12ddb68bb6a17d74399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8153a3e4128ed770871c47545f1ae7b055023e0c222ff72a759f5a341ee06483"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8a75a98ae989a27090e9c51f763990ad5bbc92d20626d54e9701c7fe597f399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:87057dd2fdde297130ff99553be8549ca38a2965871462a97394c22ed2dfc19d"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d232f99d3ab00094ebaf88e0fb7a8ccacaa54cc7fa3b8993d9627a11e6aed7a"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3c641f95f435fc6754b05591774a17df41648f0daf3de0d75ad3d9f099ab92"}, + {file = 
"ruff-0.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3826fb34c144ef1e171b323ed6ae9146ab76d109960addca730756dc19dc7b22"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eceab7d85d09321b4de18b62d38710cf296cb49e98979960a59c6b9307c18cfe"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:30ad74687e1f4a9ff8e513b20b82ccadb6bd796fe5697f1e417189c5cde6be3e"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a7e3818698f8460bd0f8d4322bbe99db8327e9bc2c93c789d3159f5b335f47da"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:edf23041242c48b0d8295214783ef543847ef29e8226d9f69bf96592dba82a83"}, + {file = "ruff-0.2.0-py3-none-win32.whl", hash = "sha256:e155147199c2714ff52385b760fe242bb99ea64b240a9ffbd6a5918eb1268843"}, + {file = "ruff-0.2.0-py3-none-win_amd64.whl", hash = "sha256:ba918e01cdd21e81b07555564f40d307b0caafa9a7a65742e98ff244f5035c59"}, + {file = "ruff-0.2.0-py3-none-win_arm64.whl", hash = "sha256:3fbaff1ba9564a2c5943f8f38bc221f04bac687cc7485e45237579fee7ccda79"}, + {file = "ruff-0.2.0.tar.gz", hash = "sha256:63856b91837606c673537d2889989733d7dffde553828d3b0f0bacfa6def54be"}, ] [[package]] @@ -1244,4 +1244,4 @@ tqdm = ["tqdm"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "c1685b3c0330bc060c0c174892c686176eac896e4dd3eb4d4b66bc91b4abbff8" +content-hash = "9c1eef38de5b33209d134a69d9c8c36cb66cdc159738482e05108e702ee3c34b" diff --git a/pyproject.toml b/pyproject.toml index d810eeb..102ec1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ tox = "*" pytest-xdist = "*" # parallelisation # linting, formatting -ruff = "^0.1.13" +ruff = "^0.2.0" # docs sphinx = "*" diff --git a/tests/conftest.py b/tests/conftest.py index 9d21e3b..1fd8395 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,72 +1,43 @@ """Pytest conftest module containing common test configuration and fixtures.""" -import os +from 
__future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from libpdf.apiobjects import ApiObjects import pytest from libpdf import load # test PDFs from pdfplumber -PDF_LOREM_IPSUM = os.path.join(os.path.dirname(__file__), "pdf", "lorem-ipsum.pdf") -PDF_TWO_COLUMNS = os.path.join(os.path.dirname(__file__), "pdf", "two_colums_sampe.pdf") -PDF_WITH_EMPTY_OUTLINE = os.path.join( - os.path.dirname(__file__), "pdf", "issue-67-example.pdf" -) -PDF_OUTLINE_NO_DEST = os.path.join(os.path.dirname(__file__), "pdf", "pdffill-demo.pdf") -PDF_FIGURE_WITH_INVALID_BBOX = os.path.join( - os.path.dirname(__file__), "pdf", "pr-138-example.pdf" -) -PDF_CHAPTER_DETECTION = os.path.join( - os.path.dirname(__file__), "pdf", "DS93-chapter-issue-fix.pdf" -) +PDF_LOREM_IPSUM = Path(__file__).parent / "pdf" / "lorem-ipsum.pdf" +PDF_TWO_COLUMNS = Path(__file__).parent / "pdf" / "two_colums_sampe.pdf" +PDF_WITH_EMPTY_OUTLINE = Path(__file__).parent / "pdf" / "issue-67-example.pdf" + +PDF_OUTLINE_NO_DEST = Path(__file__).parent / "pdf" / "pdffill-demo.pdf" +PDF_FIGURE_WITH_INVALID_BBOX = Path(__file__).parent / "pdf" / "pr-138-example.pdf" +PDF_CHAPTER_DETECTION = Path(__file__).parent / "pdf" / "DS93-chapter-issue-fix.pdf" # full features PDF -PDF_FULL_FEATURES = os.path.join(os.path.dirname(__file__), "pdf", "full_features.pdf") -PDF_FIGURES_EXTRACTION = os.path.join( - os.path.dirname(__file__), "pdf", "test_figures_extraction.pdf" -) -PDF_SMART_HEADER_FOOTER_DETECTION = os.path.join( - os.path.dirname(__file__), "pdf", "test_header_footer_detection.pdf" +PDF_FULL_FEATURES = Path(__file__).parent / "pdf" / "full_features.pdf" +PDF_FIGURES_EXTRACTION = Path(__file__).parent / "pdf" / "test_figures_extraction.pdf" +PDF_SMART_HEADER_FOOTER_DETECTION = ( + Path(__file__).parent / "pdf" / "test_header_footer_detection.pdf" ) # test PDFs from official python documentation -PDF_PYTHON_LOGGING = os.path.join(os.path.dirname(__file__), "pdf", 
"howto-logging.pdf") - - -def obj_equal(class_type, instance1, instance2): - """ - Do a attribute based comparison of instances. - - :param class_type: both instances must be of this type - :param instance1: first object - :param instance2: second object - :return: True if all attributes are equal else False - """ - if not isinstance(instance1, class_type) or not isinstance(instance2, class_type): - # don't attempt to compare against unrelated types - return NotImplemented +PDF_PYTHON_LOGGING = Path(__file__).parent / "pdf" / "howto-logging.pdf" - # get attributes of each and exclude special names and back references - self_attr = [ - attr - for attr in dir(instance1) - if (not attr.startswith("__") and not attr.startswith("b_")) - ] - other_attr = [ - attr - for attr in dir(instance2) - if (not attr.startswith("__") and not attr.startswith("b_")) - ] - if set(self_attr) == set(other_attr): - for attr in self_attr: - if getattr(instance1, attr) != getattr(instance1, attr): - # TODO this uses the equality operator which might fail for referred elements like page on Position - return False - return True - return False +# test PDF for rect extraction generateby by sphinx-simplepdf +PDF_RECTS_EXTRACTION = Path(__file__).parent / "pdf" / "test_rects_extraction.pdf" @pytest.fixture(scope="session") -def load_full_features_pdf(tmpdir_factory, request): +def load_full_features_pdf( + tmpdir_factory: pytest.TempPathFactory, request: pytest.FixtureRequest +) -> tuple(str, ApiObjects | None): """Load test pdf and return temporary directory path and the libpdf object.""" tmpdir = tmpdir_factory.mktemp("full_features_pdf") tmpdir_path = str(tmpdir) @@ -74,5 +45,5 @@ def load_full_features_pdf(tmpdir_factory, request): return tmpdir_path, load( PDF_FULL_FEATURES, save_figures=save_figures, - figure_dir=os.path.join(tmpdir_path, "figures"), + figure_dir=Path(tmpdir_path) / "figures", ) diff --git a/tests/pdf/test_rects_extraction.pdf b/tests/pdf/test_rects_extraction.pdf new file 
mode 100644 index 0000000..b8b9096 Binary files /dev/null and b/tests/pdf/test_rects_extraction.pdf differ diff --git a/tests/test_catalog.py b/tests/test_catalog.py index 5987933..6a7aced 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -12,7 +12,7 @@ def test_catalog_with_empty_outline(): """Check if catalog extracted correctly with pdf that has empty outline.""" runner = CliRunner() - result = runner.invoke(libpdf.core.main_cli, [PDF_WITH_EMPTY_OUTLINE]) + result = runner.invoke(libpdf.core.main_cli, [str(PDF_WITH_EMPTY_OUTLINE)]) assert result.exit_code == 0 objects = libpdf.load(PDF_WITH_EMPTY_OUTLINE) diff --git a/tests/test_cli.py b/tests/test_cli.py index d2a6a8b..eec1c55 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -13,6 +13,8 @@ def test_cli_ok(path): """Check if CLI exits with code 0 when no errors occur.""" runner = CliRunner() - result = runner.invoke(main_cli, [path, "-o", "out.yaml", "-f", "yaml"]) + result = runner.invoke( + main_cli, [str(path.absolute()), "-o", "out.yaml", "-f", "yaml"] + ) assert result.exception is None assert result.exit_code == 0 diff --git a/tests/test_figures.py b/tests/test_figures.py index 7054eb2..f5d662f 100644 --- a/tests/test_figures.py +++ b/tests/test_figures.py @@ -12,7 +12,7 @@ def test_figures_extract_with_invalid_bbox(): """Check if figures extraction correctly when figures have invalid bbox.""" runner = CliRunner() - result = runner.invoke(libpdf.core.main_cli, [PDF_FIGURE_WITH_INVALID_BBOX]) + result = runner.invoke(libpdf.core.main_cli, [str(PDF_FIGURE_WITH_INVALID_BBOX)]) assert result.exit_code == 0 objects = libpdf.load(PDF_FIGURE_WITH_INVALID_BBOX) diff --git a/tests/test_rects.py b/tests/test_rects.py new file mode 100644 index 0000000..6393151 --- /dev/null +++ b/tests/test_rects.py @@ -0,0 +1,233 @@ +"""Test rects extraction.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence + + from 
libpdf.apiobjects import ApiObjects, Chapter, Paragraph, Rect, Table + +import libpdf +from tests.conftest import ( + PDF_RECTS_EXTRACTION, +) + + +def find_chapter(objects: ApiObjects, chapter_name: str) -> Chapter: + """ + search for given chapter in the objects. + + :return: found chapter + """ + chapters = objects.flattened.chapters + assert len(chapters) > 0 + + chapter = next((c for c in chapters if c.title == chapter_name), None) + + assert chapter is not None + return chapter + + +def check_chapter_contains_text_paragraph( + chapter: Chapter, text: str +) -> [Paragraph | None]: + """ + check for text in chapter paragraphs. + + :return: found paragraph + """ + assert chapter.content is not None + + for content in chapter.content: + if content.type == "paragraph" and text in content.textbox.text: + return content + + return None + + +def check_chapter_contains_text_rect(chapter: Chapter, text: str) -> [Paragraph | None]: + """ + check for text in chapter rects. + + :return: found rect + """ + assert chapter.content is not None + + for content in chapter.content: + if content.type == "rect" and text in content.textbox.text: + return content + + return None + + +def check_chapter_rects_count(chapter: Chapter) -> int: + """ + check for number of rects in chapter. + + :return: number of rects + """ + assert chapter.content is not None + count = 0 + + for content in chapter.content: + if content.type == "rect": + count += 1 + + return count + + +def check_content_color(content: Rect, color: Sequence[int]) -> bool: + """ + check rect color is equal given color. + + :return: True if equal + """ + return content.non_stroking_color == color + + +def check_content_margins_equal(rect: Rect, paragraph: Paragraph) -> bool: + """ + check for text margins between rect and paragraph. 
+ + :return: True if x0 equals + """ + return paragraph.textbox.x0 == rect.textbox.x0 + + +def check_content_margins_greater( + rect: Rect, paragraph: Paragraph, offset: int +) -> bool: + """ + check for text margins of given rect.x0 + offset is greater or equal paragraph x0. + + :return: True if rect x0 plus offset is greater or equal paragraph x0 + """ + return rect.textbox.x0 + offset >= paragraph.textbox.x0 + + +def find_tables(chapter: Chapter) -> [Table]: + """ + collect all table elements in chapter. + + :return: List of Tables + """ + assert chapter.content is not None + + return [content for content in chapter.content if content.type == "table"] + + +def test_rects_extraction_code_block() -> None: + """Test rect extraction of multiline codeblock.""" + smart_page_crop = ( + True # remove header and footers so rects IN chapters are left only. + ) + + objects = libpdf.load(PDF_RECTS_EXTRACTION, smart_page_crop=smart_page_crop) + assert objects.flattened.rects is not None + + chapter = find_chapter(objects, "Code Block Highlighting") + + assert chapter is not None + + assert check_chapter_rects_count(chapter) == 1 + + paragraph = check_chapter_contains_text_paragraph( + chapter, "def decode_title(obj_bytes: bytes) -> str:" + ) + assert paragraph is not None + + rect = check_chapter_contains_text_rect( + chapter, "def decode_title(obj_bytes: bytes) -> str:" + ) + assert rect is not None + + assert check_content_color(rect, (0.941176, 0.941176, 0.941176)) + + assert check_content_margins_equal(paragraph, rect) + + +def test_rects_extraction_code_inline() -> None: + """Test rect extraction of inline codeblock.""" + smart_page_crop = ( + True # remove header and footers so rects IN chapters are left only.
+ ) + + objects = libpdf.load(PDF_RECTS_EXTRACTION, smart_page_crop=smart_page_crop) + assert objects.flattened.rects is not None + + chapter = find_chapter(objects, "Code Inline Highlighting") + + # 2 inline code blocks, but the first one is broken in two lines + assert check_chapter_rects_count(chapter) == 1 * 3 + + paragraph = check_chapter_contains_text_paragraph( + chapter, "from pathlib import Path" + ) + assert paragraph is not None + + rect = check_chapter_contains_text_rect(chapter, "from pathlib import Path") + assert rect is not None + assert rect.textbox.text == "from pathlib import Path" + assert check_content_color(rect, (0.945098, 0.945098, 0.945098)) + assert check_content_margins_greater(rect, paragraph, 234) + + assert ( + check_chapter_contains_text_rect( + chapter, "decode_title(obj_bytes: bytes) -> str" + ) + is None + ) + rect = check_chapter_contains_text_rect(chapter, "decode_title(obj_bytes: bytes)") + assert rect is not None + rect_str = check_chapter_contains_text_rect(chapter, "str ") + assert rect_str is not None + + assert rect_str.textbox.x0 < rect.textbox.x0 + + +def test_rects_extraction_adminition() -> None: + """Test rect extraction of 3 admonitions.""" + smart_page_crop = ( + True # remove header and footers so rects IN chapters are left only. 
+ ) + + objects = libpdf.load(PDF_RECTS_EXTRACTION, smart_page_crop=smart_page_crop) + assert objects.flattened.rects is not None + + chapter = find_chapter(objects, "Adminition") + + assert ( + check_chapter_rects_count(chapter) == 3 * 2 + ) # 2 rects per admonition, 3 types of admonition + + rect = check_chapter_contains_text_rect(chapter, "A very importing Adminition") + assert rect is not None + assert check_content_color(rect, (0.858824, 0.980392, 0.956863)) + + rect_inner = check_chapter_contains_text_rect(chapter, "Wichtig") + assert rect_inner is not None + assert check_content_color(rect, (0.858824, 0.980392, 0.956863)) + + +def test_rects_extraction_table() -> None: + """Test rect extraction of table colored cells.""" + smart_page_crop = ( + True # remove header and footers so rects IN chapters are left only. + ) + + objects = libpdf.load(PDF_RECTS_EXTRACTION, smart_page_crop=smart_page_crop) + assert objects.flattened.rects is not None + + chapter = find_chapter(objects, "Tables") + + tables = find_tables(chapter) + + assert len(tables) == 1 + + table = tables[0] + assert table.columns_count == 1 * 3 + assert table.rows_count == 1 + + assert check_chapter_rects_count(chapter) == 1 * 5 diff --git a/tox.ini b/tox.ini index e7e178a..e649c91 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ isolated_build = True # python 3.11 and 3.12 test ok on Linux but fail on Windows due to Pillow # Currently used Pillow 9.0.1 only supports Python 3.10 according to # https://pillow.readthedocs.io/en/latest/installation.html#python-support -envlist = py{38,39,310,311,312}, docs, black +envlist = py{38,39,310,311,312}, docs, lint, format-check [testenv] allowlist_externals = poetry @@ -36,7 +36,8 @@ basepython = python3.12 commands= poetry install --no-root poetry run ruff format --check {[testenv]py_folders} - poetry run ruff {[testenv]py_folders} + # extend the file list once a file is touched so lint issues get fixed over time + poetry run ruff tests/conftest.py 
tests/test_rects.py [testenv:format-check] envdir = {toxworkdir}/py312