Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding tests for rect extractions #36

Merged
merged 7 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion docs/contents/visual_debugging.rst
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
Visual debugging
================
================

Requirements
------------

The visual debugging feature requires Ghostscript to extract images from PDF files.
Ghostscript is not bundled with libpdf and must be installed separately on your system.

See `Ghostscript <https://www.ghostscript.com/>`_ for installation instructions.
11 changes: 5 additions & 6 deletions libpdf/apiobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,12 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed
if pdfminer is not None:
# take argument first
self.pdfminer = pdfminer
elif pdfplumber is not None:
# set from pdfplumber document
self.pdfminer = pdfplumber.doc
else:
if pdfplumber is not None:
# set from pdfplumber document
self.pdfminer = pdfplumber.doc
else:
# nothing available
self.pdfminer = None
# nothing available
self.pdfminer = None


class Flattened(NamedTuple):
Expand Down
14 changes: 6 additions & 8 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def resolve_name_obj(name_tree_kids):
"""
temp_list = []
for kid in name_tree_kids:
if "Kids" in kid and kid["Kids"]:
if kid.get("Kids"):
temp_list.extend([kid_kid.resolve() for kid_kid in kid["Kids"]])
elif "Names" in kid:
return name_tree_kids
Expand Down Expand Up @@ -311,14 +311,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl
raise RuntimeError(
f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object"
)
elif isinstance(outline_obj["Dest"], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_obj["Dest"].name
else:
# named destination
if isinstance(outline_obj["Dest"], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_obj["Dest"].name
else:
# PDF 1.2 byte string
outline_dest = outline_obj["Dest"].decode("utf-8")
# PDF 1.2 byte string
outline_dest = outline_obj["Dest"].decode("utf-8")
title_bytes = outline_obj["Title"]
else:
raise ValueError("No key A and Dest in outline.")
Expand Down
14 changes: 5 additions & 9 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,8 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t
# recursively check again, to find the next min_low_pos, which will determine the header/footer boundary
if elements_list:
return check_false_positive_header_footer(pdf, elements_list)
else:
if len(elements_list) == 1:
elements_list.pop()
elif len(elements_list) == 1:
elements_list.pop()
else:
for idx, element in enumerate(elements_list):
if float(f"{element.position.y0:.4f}") == header_low_pos:
Expand Down Expand Up @@ -853,12 +852,9 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
"remove filtered figure fig0 due to partially overlap"
)
filtered_figures.remove(fig0)
else:
if fig1 in filtered_figures:
LOG.debug(
"remove filtered figure fig1 due to partially overlap"
)
filtered_figures.remove(fig1)
elif fig1 in filtered_figures:
LOG.debug("remove filtered figure fig1 due to partially overlap")
filtered_figures.remove(fig1)

if len(filtered_figures) < len(figures_list):
LOG.debug(
Expand Down
15 changes: 7 additions & 8 deletions libpdf/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,13 @@ def config_logger(cli=True):
init_tqdm = True
else:
init_basic = True
else: # API usage
if TQDM_AVAILABLE:
# this needs to be documented so any API user is not surprised that the libpdf logger has an attached
# handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
# used or the user wants to define something else
init_tqdm = True
else: # don't init anything, it's up to the user
pass
elif TQDM_AVAILABLE:
# this needs to be documented so any API user is not surprised that the libpdf logger has an attached
# handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
# used or the user wants to define something else
init_tqdm = True
else: # don't init anything, it's up to the user
pass

log_format = "[%(levelname)5s] %(name)s - %(message)s"
if init_tqdm:
Expand Down
5 changes: 2 additions & 3 deletions libpdf/models/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ def to_dict(self):
for key, value in vars_dict.items():
if key.startswith("b_"):
delete_backref_keys.append(key)
else:
if isinstance(value, ModelBase):
vars_dict[key] = value.to_dict()
elif isinstance(value, ModelBase):
vars_dict[key] = value.to_dict()
# delete back references
for key in delete_backref_keys:
del vars_dict[key]
Expand Down
52 changes: 25 additions & 27 deletions libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,21 +309,20 @@ def fill_elements_content(
id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1}
content = elements_in_outline[index_element].content
index_b_chapter = index_element
elif "content" in locals():
element.idx = id_dict[element.type]
element.b_chapter = elements_in_outline[index_b_chapter]
content.append(element)
id_dict[element.type] += 1
else:
if "content" in locals():
element.idx = id_dict[element.type]
element.b_chapter = elements_in_outline[index_b_chapter]
content.append(element)
id_dict[element.type] += 1
else:
# TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
# 2. the message is unclear
# 3. if it's a programming error, fix the code
# 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
# and log an understandable, critical error
raise ValueError(
"elements can not fill into the content because it does not exist"
)
# TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
# 2. the message is unclear
# 3. if it's a programming error, fix the code
# 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
# and log an understandable, critical error
raise ValueError(
"elements can not fill into the content because it does not exist"
)

chapters_content = list(
filter(lambda x: isinstance(x, Chapter), elements_in_outline)
Expand Down Expand Up @@ -435,20 +434,19 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al
for link in element.links:
target_id = find_target_id(link, pages_list, element)
link.libpdf_target = target_id
elif isinstance(element, Cell):
# Cell is not considered as element
pass
else:
if isinstance(element, Cell):
# Cell is not considered as element
pass
else:
# TODO reason about the overall logic; which cases can be removed? distinguish between
# programming errors (raise RuntimeErrors) and cases that actually may exist in the
# wild and write human-readable log messages (e.g.
# The link on page xy with text xy cannot be resolved to a libpdf element; linking
# to the target page position instead
LOG.error(
"The source link in the paragraph %s is missing",
repr(element),
)
# TODO reason about the overall logic; which cases can be removed? distinguish between
# programming errors (raise RuntimeErrors) and cases that actually may exist in the
# wild and write human-readable log messages (e.g.
# The link on page xy with text xy cannot be resolved to a libpdf element; linking
# to the target page position instead
LOG.error(
"The source link in the paragraph %s is missing",
repr(element),
)


def elements_with_anno_finder(
Expand Down
25 changes: 11 additions & 14 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,9 @@ def extract_paragraphs_chapters(
chapter_list = []
if no_chapters:
LOG.info("Excluding chapters extraction")
else:
if catalog["outline"]:
LOG.info("Extracting chapters ...")
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)
elif catalog["outline"]:
LOG.info("Extracting chapters ...")
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)

paragraph_list = []
if no_paragraphs:
Expand Down Expand Up @@ -782,17 +781,15 @@ def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # bette
# the incoming char is outside the anno-rectangle
pass

else:
# the char is LTAnno
if idx_char == len(ltobjs_in_lttextline) - 1:
# the last char of the textline
elif idx_char == len(ltobjs_in_lttextline) - 1:
# the last char of the textline
anno_complete = True
elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
# the next char is outside of the current anno-rectangle
anno_complete = True
elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
# the next char is outside of the current anno-rectangle
anno_complete = True
else:
raise ValueError("two LTAnno occurs in a row")
else:
raise ValueError("two LTAnno occurs in a row")

return anno_complete

Expand Down
67 changes: 32 additions & 35 deletions libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,43 +303,40 @@ def find_lt_obj_in_bbox(
):
# This is the case when a LT object is neither inside nor intersected with the given bounding box.
pass
else:
# This is the case when a LT object is intersected with the given box. In this case, the LT objects inside the
# given bounding box need to be hierarchically and recursively found.
if hasattr(lt_obj, "_objs"):
# All the downwards hierarchical LT objects are stored in the attribute "_objs".
# If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
text_inside_bbox = False # True on LTTextLine level when the first LTChar is inside the BBOX
for item in lt_obj._objs: # pylint: disable=protected-access
if isinstance(item, LTAnno):
# special treatment of LTAnno because it is virtual with no position data
if text_inside_bbox:
# LTAnno is added because an LTChar was inside the bbox before
elif hasattr(lt_obj, "_objs"):
# All the downwards hierarchical LT objects are stored in the attribute "_objs".
# If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
text_inside_bbox = (
False # True on LTTextLine level when the first LTChar is inside the BBOX
)
for item in lt_obj._objs: # pylint: disable=protected-access
if isinstance(item, LTAnno):
# special treatment of LTAnno because it is virtual with no position data
if text_inside_bbox:
# LTAnno is added because an LTChar was inside the bbox before
lt_objs_in_bbox.append(item)
elif isinstance(item, LTChar):
# check if the first and last LTChar have shown in the given bbox to decide if the trailing
# LTAnno should be added
ltchar_inside = check_lt_obj_in_bbox(item, bbox)
if text_inside_bbox:
if ltchar_inside:
lt_objs_in_bbox.append(item)
else:
if isinstance(item, LTChar):
# check if the first and last LTChar have shown in the given bbox to decide if the trailing
# LTAnno should be added
ltchar_inside = check_lt_obj_in_bbox(item, bbox)
if text_inside_bbox:
if ltchar_inside:
lt_objs_in_bbox.append(item)
else:
# the bbox just ended and can't enter again
break
else:
if ltchar_inside:
lt_objs_in_bbox.append(item)
text_inside_bbox = True
else:
# no LTChar was added before, so not in BBOX yet
pass
else:
# it is not an LTAnno nor an LTChar, so recurse and break it further down
find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
else:
# no attribute "_objs" exists. It reaches the bottom of the hierarchy
pass
# the bbox just ended and can't enter again
break
elif ltchar_inside:
lt_objs_in_bbox.append(item)
text_inside_bbox = True
else:
# no LTChar was added before, so not in BBOX yet
pass
else:
# it is not an LTAnno nor an LTChar, so recurse and break it further down
find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
else:
# no attribute "_objs" exists. It reaches the bottom of the hierarchy
pass


def lt_page_crop(
Expand Down
Loading