diff --git a/CHANGES.md b/CHANGES.md
index 0bf11524..a09699aa 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,9 +1,17 @@
# Change Log
-## Changes in version 0.0.15
+## Changes in version 0.0.17
+
+### Fixes:
+* [147](https://github.com/pymupdf/RAG/issues/147) - Error when page contains nothing but a table.
+* [81](https://github.com/pymupdf/RAG/issues/81) - Issues with bullet points in PDFs.
+* [78](https://github.com/pymupdf/RAG/issues/78) - Text extraction from multi-column PDF files.
+
+
+## Changes in version 0.0.15
### Fixes:
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
index 690ef694..6861f438 100644
--- a/pymupdf4llm/pymupdf4llm/__init__.py
+++ b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
-__version__ = "0.0.16"
+__version__ = "0.0.17"
version = __version__
version_tuple = tuple(map(int, version.split(".")))
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index 49045389..6be39bf1 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -33,26 +33,30 @@
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
from pymupdf4llm.helpers.multi_column import column_boxes
from pymupdf4llm.helpers.progress import ProgressBar
+from dataclasses import dataclass
+# Characters recognized as bullets when starting a line.
bullet = tuple(
- [
- "- ",
- "* ",
- chr(0xF0A7),
- chr(0xF0B7),
- chr(0xB7),
- chr(0xBE),
- chr(0xBE) + " ",
- chr(8226),
- ]
- + list(map(chr, range(9642, 9680)))
+ ["- ", "* ", "> ", chr(0xB6), chr(0xB7), chr(8226), chr(0xF0A7), chr(0xF0B7)]
+ + list(map(chr, range(9632, 9680)))
)
GRAPHICS_TEXT = "\n![](%s)\n"
class IdentifyHeaders:
- """Compute data for identifying header text."""
+ """Compute data for identifying header text.
+
+ All non-white text from all selected pages is extracted and its font size
+ noted as a rounded value.
+ The most frequent font size (and all smaller ones) is taken as body text
+ font size.
+ Larger font sizes are mapped to strings of multiples of '#', the header
+ tag in Markdown, which in turn is Markdown's representation of HTML's
+ header tags <h1> to <h6>.
+ Larger font sizes than body text but smaller than the <h6> font size are
+ represented as <h6>.
+ """
def __init__(
self,
@@ -126,18 +130,28 @@ def get_header_id(self, span: dict, page=None) -> str:
markdown header prefix string of 0 to n concatenated '#' characters.
"""
fontsize = round(span["size"]) # compute fontsize
+ if fontsize <= self.body_limit: # shortcut for body text
+ return ""
hdr_id = self.header_id.get(fontsize, "")
+ # If no header but larger than body text, assign <h6>.
if not hdr_id and fontsize > self.body_limit:
hdr_id = "###### "
return hdr_id
-def poly_area(points):
+# store relevant parameters here
+@dataclass
+class Parameters:
+ pass
+
+
+def poly_area(points: list) -> float:
"""Compute the area of the polygon represented by the given points.
We are using the "shoelace" algorithm (Gauss) for this.
+ Accepts a list of Point items and returns a float.
"""
- # make a local copy of points (avoid changing the original)
+ # make a local copy of points (do not change the original)
pts = points[:]
# remove duplicated connector points first
for i in range(len(pts) - 1, 0, -1):
@@ -153,7 +167,12 @@ def poly_area(points):
def refine_boxes(boxes):
- """Join any rectangles with a pairwise non-empty overlap."""
+ """Join any rectangles with a pairwise non-empty overlap.
+
+ Accepts and returns a list of Rect items.
+ Note that rectangles that only "touch" each other (common point or edge)
+ are not considered as overlapping.
+ """
new_rects = []
# list of all vector graphic rectangles
prects = boxes[:]
@@ -167,8 +186,9 @@ def refine_boxes(boxes):
if r.intersects(prects[i]): # enlarge first rect with this
r |= prects[i]
del prects[i] # delete this rect
- repeat = True # indicate we must try again
+ repeat = True # indicate must try again
+ # first rect now includes all overlaps
new_rects.append(r)
del prects[0]
@@ -179,9 +199,10 @@ def refine_boxes(boxes):
def is_significant(box, paths):
"""Check whether the rectangle "box" contains 'signifiant' drawings.
- For this to be true, at least one path must cover an area,
- which is smaller than 90% of box. Otherwise we assume
- that the graphic is decoration (highlighting, border-only etc.).
+ 'Significant' means that at least one stroked path must cover an area
+ less than 90% of box.
+ Not significant means that the graphic is decoration only (highlighting,
+ border-only etc.). It will not be considered further.
"""
box_area = abs(box) * 0.9 # 90% of area of box
@@ -264,8 +285,8 @@ def to_markdown(
if embed_images is True:
write_images = False
image_path = ""
- if not 0 < image_size_limit < 1:
- raise ValueError("'image_size_limit' must be positive and less than 1.")
+ if not 0 <= image_size_limit < 1:
+ raise ValueError("'image_size_limit' must be non-negative and less than 1.")
DPI = dpi
IGNORE_CODE = ignore_code
IMG_EXTENSION = image_format
@@ -372,15 +393,12 @@ def save_image(page, rect, i):
return ""
def write_text(
- page: pymupdf.Page,
- textpage: pymupdf.TextPage,
+ parms,
clip: pymupdf.Rect,
tabs=None,
- tab_rects: dict = None,
- img_rects: dict = None,
- links: list = None,
+ tab_rects: dict = {},
+ img_rects: list = [],
force_text=force_text,
- line_rects=None,
) -> string:
"""Output the text found inside the given clip.
@@ -391,7 +409,7 @@ def write_text(
There is also some effort for list supported (ordered / unordered) in
that typical characters are replaced by respective markdown characters.
- 'tab_rects'/'img_rects' are dictionaries of table, respectively image
+ 'tab_rects'/'img_rects' are dict / list of table, respectively image
or vector graphic rectangles.
General Markdown text generation skips these areas. Tables are written
via their own 'to_markdown' method. Images and vector graphics are
@@ -399,17 +417,20 @@ def write_text(
"""
if clip is None:
- clip = textpage.rect
+ clip = parms.clip
out_string = ""
- # This is a list of tuples (linerect, spanlist)
- nlines = get_raw_lines(textpage, clip=clip, tolerance=3)
+ tab_rects0 = list(tab_rects.values())
+ img_rects0 = img_rects
- line_rects.extend([l[0] for l in nlines]) # store line rectangles
+ # This is a list of tuples (linerect, spanlist)
+ nlines = [
+ l
+ for l in get_raw_lines(parms.textpage, clip=clip, tolerance=3)
+ if not intersects_rects(l[0], tab_rects0)
+ ]
- tab_rects0 = list(tab_rects.values())
- img_rects0 = list(img_rects.values())
- line_rects.extend(
+ parms.line_rects.extend(
[l[0] for l in nlines if not intersects_rects(l[0], tab_rects0)]
) # store line rectangles
@@ -420,9 +441,7 @@ def write_text(
for lrect, spans in nlines:
# there may be tables or images inside the text block: skip them
- if intersects_rects(lrect, tab_rects0) or intersects_rects(
- lrect, img_rects0
- ):
+ if intersects_rects(lrect, img_rects0):
continue
# ------------------------------------------------------------
@@ -436,55 +455,50 @@ def write_text(
],
key=lambda j: (j[1].y1, j[1].x0),
):
- out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n"
+ out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
set(
[
pymupdf.Rect(c)
- for c in tabs[i].header.cells + tabs[i].cells
+ for c in parms.tabs[i].header.cells
+ + parms.tabs[i].cells
if c is not None
]
),
key=lambda c: (c.y1, c.x0),
)
- line_rects.extend(cells)
+ parms.line_rects.extend(cells)
del tab_rects[i]
# ------------------------------------------------------------
# Pick up images / graphics ABOVE this text block
# ------------------------------------------------------------
for i, temp_rect in sorted(
- [
- j
- for j in img_rects.items()
- if j[1].y1 <= lrect.y0 and not (j[1] & clip).is_empty
- ],
+ [j for j in img_rects if j.y1 <= lrect.y0 and not (j & clip).is_empty],
key=lambda j: (j[1].y1, j[1].x0),
):
- pathname = save_image(page, temp_rect, i)
+ pathname = save_image(parms.page, temp_rect, i)
if pathname:
out_string += GRAPHICS_TEXT % pathname
# recursive invocation
if force_text:
img_txt = write_text(
- page,
- textpage,
- clip=temp_rect,
+ parms,
+ temp_rect,
tabs=None,
tab_rects={},
- img_rects={},
- links=links,
+ img_rects=[],
force_text=True,
- line_rects=line_rects,
)
if not is_white(img_txt):
out_string += img_txt
del img_rects[i]
+ parms.line_rects.append(lrect)
text = " ".join([s["text"] for s in spans])
# full line mono-spaced?
@@ -522,7 +536,7 @@ def write_text(
prev_lrect = lrect
# if line is a header, this will return multiple "#" characters
- hdr_string = get_header_id(span0, page=page)
+ hdr_string = get_header_id(span0, page=parms.page)
# intercept if header text has been broken in multiple lines
if hdr_string and hdr_string == prev_hdr_string:
@@ -560,14 +574,23 @@ def write_text(
suffix = "_" + suffix
# convert intersecting link into markdown syntax
- ltext = resolve_links(links, s)
+ ltext = resolve_links(parms.links, s)
if ltext:
text = f"{hdr_string}{prefix}{ltext}{suffix} "
else:
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
-
if text.startswith(bullet):
- text = "- " + text[1:]
+ text = text[1:]
+ if len(text) > 1 and text[1] == " ":
+ t = "-"
+ else:
+ t = "- "
+ text = t + text[1:]
+ dist = span0["bbox"][0] - clip.x0
+ cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(
+ span0["text"]
+ )
+ text = " " * int(round(dist / cwidth)) + text
out_string += text
if not code:
out_string += "\n"
@@ -595,105 +618,96 @@ def intersects_rects(rect, rect_list):
return i
return 0
- def output_tables(tabs, text_rect, tab_rects, line_rects, textpage):
- """Output tables above a text rectangle."""
- this_md = "" # markdown string for table content
+ def output_tables(parms, text_rect):
+ """Output tables above given text rectangle."""
+ this_md = "" # markdown string for table(s) content
if text_rect is not None: # select tables above the text block
for i, trect in sorted(
- [j for j in tab_rects.items() if j[1].y1 <= text_rect.y0],
+ [j for j in parms.tab_rects.items() if j[1].y1 <= text_rect.y0],
key=lambda j: (j[1].y1, j[1].x0),
):
- this_md += tabs[i].to_markdown(clean=False)
+ this_md += parms.tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
set(
[
pymupdf.Rect(c)
- for c in tabs[i].header.cells + tabs[i].cells
+ for c in parms.tabs[i].header.cells
+ + parms.tabs[i].cells
if c is not None
]
),
key=lambda c: (c.y1, c.x0),
)
- line_rects.extend(cells)
- del tab_rects[i] # do not touch this table twice
+ parms.line_rects.extend(cells)
+ del parms.tab_rects[i] # do not touch this table twice
else: # output all remaining tables
for i, trect in sorted(
- tab_rects.items(),
+ parms.tab_rects.items(),
key=lambda j: (j[1].y1, j[1].x0),
):
- this_md += tabs[i].to_markdown(clean=False)
+ this_md += parms.tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
set(
[
pymupdf.Rect(c)
- for c in tabs[i].header.cells + tabs[i].cells
+ for c in parms.tabs[i].header.cells
+ + parms.tabs[i].cells
if c is not None
]
),
key=lambda c: (c.y1, c.x0),
)
- line_rects.extend(cells)
- del tab_rects[i] # do not touch this table twice
+ parms.line_rects.extend(cells)
+ del parms.tab_rects[i] # do not touch this table twice
return this_md
- def output_images(page, textpage, text_rect, img_rects, line_rects):
+ def output_images(parms, text_rect):
"""Output images and graphics above text rectangle."""
- if img_rects is None:
+ if not parms.img_rects:
return ""
this_md = "" # markdown string
if text_rect is not None: # select images above the text block
- for i, img_rect in sorted(
- [j for j in img_rects.items() if j[1].y1 <= text_rect.y0],
- key=lambda j: (j[1].y1, j[1].x0),
- ):
- pathname = save_image(page, img_rect, i)
+ for i, img_rect in enumerate(parms.img_rects):
+ if not img_rect.y1 <= text_rect.y0:
+ continue
+ pathname = save_image(parms.page, img_rect, i)
if pathname:
this_md += GRAPHICS_TEXT % pathname
if force_text:
img_txt = write_text(
- page,
- textpage,
- clip=img_rect,
+ parms,
+ img_rect,
tabs=None,
tab_rects={}, # we have no tables here
- img_rects={}, # we have no other images here
- links=[], # rely on explicit HTML syntax
+ img_rects=[], # we have no other images here
force_text=True,
- line_rects=line_rects,
)
if not is_white(img_txt): # was there text at all?
this_md += img_txt
-
- del img_rects[i] # do not touch this image twice
+ del parms.img_rects[i] # do not touch this image twice
else: # output all remaining images
- for i, img_rect in sorted(
- img_rects.items(),
- key=lambda j: (j[1].y1, j[1].x0),
- ):
- pathname = save_image(page, img_rect, i)
+ for i, img_rect in enumerate(parms.img_rects):
+ pathname = save_image(parms.page, img_rect, i)
if pathname:
this_md += GRAPHICS_TEXT % pathname
if force_text:
img_txt = write_text(
- page,
- textpage,
- clip=img_rect,
+ parms,
+ img_rect,
tabs=None,
tab_rects={}, # we have no tables here
- img_rects={}, # we have no other images here
- links=[], # rely on explicit HTML syntax
+ img_rects=[], # we have no other images here
force_text=True,
- line_rects=line_rects,
)
if not is_white(img_txt):
this_md += img_txt
- del img_rects[i] # do not touch this image twice
+ del parms.img_rects[i] # do not touch this image twice
return this_md
def get_metadata(doc, pno):
@@ -703,7 +717,16 @@ def get_metadata(doc, pno):
meta["page"] = pno + 1
return meta
- def sort_words(words):
+ def sort_words(words: list) -> list:
+ """Reorder words in lines.
+
+ The argument list must be presorted by bottom, then left coordinates.
+
+ Words with similar top / bottom coordinates are assumed to belong to
+ the same line and will be sorted left to right within that line.
+ """
+ if not words:
+ return []
nwords = []
line = [words[0]]
lrect = pymupdf.Rect(words[0][:4])
@@ -734,74 +757,86 @@ def get_page_output(doc, pno, margins, textflags):
"""
page = doc[pno]
page.remove_rotation() # make sure we work on rotation=0
- md_string = ""
+ parms = Parameters() # all page information
+ parms.page = page
+ parms.md_string = ""
+ parms.images = []
+ parms.tables = []
+ parms.graphics = []
+ parms.words = []
+ parms.line_rects = []
+
+ # catch too-many-graphics situation
if GRAPHICS_LIMIT is not None:
- test_paths = page.get_cdrawings()
+ test_paths = page.get_cdrawings() # fastest access to graphics
if (excess := len(test_paths)) > GRAPHICS_LIMIT:
- md_string = (
+ parms.md_string = (
f"\n**Ignoring page {page.number} with {excess}+ vector graphics.**"
)
- md_string += "\n\n-----\n\n"
- return md_string, [], [], []
+ parms.md_string += "\n\n-----\n\n"
+ return parms
left, top, right, bottom = margins
- clip = page.rect + (left, top, -right, -bottom)
+ parms.clip = page.rect + (left, top, -right, -bottom)
+
# extract external links on page
- links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI]
+ parms.links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI]
# make a TextPage for all later extractions
- textpage = page.get_textpage(flags=textflags, clip=clip)
+ parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip)
# extract images on page
- # ignore images contained in another one (simplified mechanism)
- img_info = page.get_image_info()[:]
+ # ignore images contained in some other one (simplified mechanism)
+ img_info = page.get_image_info()
+ for i in range(len(img_info)):
+ item = img_info[i]
+ item["bbox"] = pymupdf.Rect(item["bbox"]) & parms.clip
+ img_info[i] = item
+
# sort descending by image area size
- img_info.sort(key=lambda i: abs(pymupdf.Rect(i["bbox"])), reverse=True)
+ img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
# run from back to front (= small to large)
for i in range(len(img_info) - 1, 0, -1):
- img1 = img_info[i]
- img0 = img_info[i - 1]
- if (
- pymupdf.Rect(img1["bbox"]) & page.rect
- in pymupdf.Rect(img0["bbox"]) & page.rect
- ):
- del img_info[i] # contained in some larger image
- images = img_info
- tables = []
- graphics = []
- line_rects = []
+ r = img_info[i]["bbox"]
+ if r.is_empty:
+ del img_info[i]
+ continue
+ for j in range(i): # image areas larger than r
+ if r in img_info[j]["bbox"]:
+ del img_info[i] # contained in some larger image
+ break
+ parms.images = img_info
+ parms.img_rects = [i["bbox"] for i in parms.images]
# Locate all tables on page
- tabs = page.find_tables(clip=clip, strategy=table_strategy)
-
+ parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
# Make a list of table boundary boxes.
# Must include the header bbox (which may exist outside tab.bbox)
tab_rects = {}
- for i, t in enumerate(tabs):
+ for i, t in enumerate(parms.tabs):
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
tab_dict = {
"bbox": tuple(tab_rects[i]),
"rows": t.row_count,
"columns": t.col_count,
}
- tables.append(tab_dict)
-
+ parms.tables.append(tab_dict)
+ parms.tab_rects = tab_rects
# list of table rectangles
- tab_rects0 = list(tab_rects.values())
+ parms.tab_rects0 = list(tab_rects.values())
# Select paths not contained in any table
# ignore full page graphics
- page_clip = page.rect + (36, 36, -36, -36)
paths = [
p
for p in page.get_drawings()
- if not intersects_rects(p["rect"], tab_rects0)
- and p["rect"] in page_clip
- and p["rect"].width < page_clip.width
- and p["rect"].height < page_clip.height
+ if not intersects_rects(p["rect"], parms.tab_rects0)
+ and p["rect"] in parms.clip
+ and p["rect"].width < parms.clip.width
+ and p["rect"].height < parms.clip.height
]
- # We also ignore vector graphics that only represent "text
- # emphasizing sugar".
+ # We also ignore vector graphics that only represent
+ # "text emphasizing sugar".
vg_clusters0 = [] # worthwhile vector graphics go here
# walk through all vector graphics outside any table
@@ -810,74 +845,67 @@ def get_page_output(doc, pno, margins, textflags):
vg_clusters0.append(bbox)
# remove paths that are not in some relevant graphic
- actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)]
+ parms.actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)]
# also add image rectangles to the list
- vg_clusters0 += [pymupdf.Rect(i["bbox"]) for i in img_info]
+ vg_clusters0.extend(parms.img_rects)
# these may no longer be pairwise disjoint:
# remove area overlaps by joining into larger rects
- vg_clusters0 = refine_boxes(vg_clusters0)
+ parms.vg_clusters0 = refine_boxes(vg_clusters0)
- vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0))
+ parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0))
# identify text bboxes on page, avoiding tables, images and graphics
text_rects = column_boxes(
- page,
- paths=actual_paths,
+ parms.page,
+ paths=parms.actual_paths,
no_image_text=True,
- textpage=textpage,
- avoid=tab_rects0 + vg_clusters0,
+ textpage=parms.textpage,
+ avoid=parms.tab_rects0 + parms.vg_clusters0,
footer_margin=margins[3],
header_margin=margins[1],
)
- """Extract markdown text iterating over text rectangles.
+ """
+ ------------------------------------------------------------------
+ Extract markdown text iterating over text rectangles.
We also output any tables. They may live above, below or inside
the text rectangles.
+ ------------------------------------------------------------------
"""
for text_rect in text_rects:
- # output tables above this block of text
- md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage)
- md_string += output_images(
- page, textpage, text_rect, vg_clusters, line_rects
- )
+ # output tables above this rectangle
+ parms.md_string += output_tables(parms, text_rect)
+ parms.md_string += output_images(parms, text_rect)
# output text inside this rectangle
- md_string += write_text(
- page,
- textpage,
- text_rect,
- tabs=tabs,
- tab_rects=tab_rects,
- img_rects=vg_clusters,
- links=links,
- force_text=force_text,
- line_rects=line_rects,
- )
+ parms.md_string += write_text(parms, text_rect, force_text=force_text)
- md_string = md_string.replace(" ,", ",").replace("-\n", "")
+ parms.md_string = parms.md_string.replace(" ,", ",").replace("-\n", "")
# write any remaining tables and images
- md_string += output_tables(tabs, None, tab_rects, line_rects, textpage)
- md_string += output_images(page, textpage, None, vg_clusters, line_rects)
- md_string += "\n-----\n\n"
- while md_string.startswith("\n"):
- md_string = md_string[1:]
- md_string = md_string.replace(chr(0), chr(0xFFFD))
+
+ parms.md_string += output_tables(parms, None)
+
+ parms.md_string += output_images(parms, None)
+
+ parms.md_string += "\n-----\n\n"
+ while parms.md_string.startswith("\n"):
+ parms.md_string = parms.md_string[1:]
+ parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD))
if EXTRACT_WORDS is True:
# output words in sequence compliant with Markdown text
- rawwords = textpage.extractWORDS()
+ rawwords = parms.textpage.extractWORDS()
rawwords.sort(key=lambda w: (w[3], w[0]))
+
words = []
- for lrect in line_rects:
+ for lrect in parms.line_rects:
lwords = []
for w in rawwords:
wrect = pymupdf.Rect(w[:4])
if wrect in lrect:
lwords.append(w)
- # append sorted words of this line
- # words.extend(sorted(lwords, key=lambda w: w[0]))
words.extend(sort_words(lwords))
# remove word duplicates without spoiling the sequence
@@ -890,7 +918,8 @@ def get_page_output(doc, pno, margins, textflags):
else:
words = []
- return md_string, images, tables, graphics, words
+ parms.words = words
+ return parms
if page_chunks is False:
document_output = ""
@@ -904,11 +933,9 @@ def get_page_output(doc, pno, margins, textflags):
print(f"Processing {doc.name}...")
pages = ProgressBar(pages)
for pno in pages:
- page_output, images, tables, graphics, words = get_page_output(
- doc, pno, margins, textflags
- )
+ parms = get_page_output(doc, pno, margins, textflags)
if page_chunks is False:
- document_output += page_output
+ document_output += parms.md_string
else:
# build subet of TOC for this page
page_tocs = [t for t in toc if t[-1] == pno + 1]
@@ -918,13 +945,14 @@ def get_page_output(doc, pno, margins, textflags):
{
"metadata": metadata,
"toc_items": page_tocs,
- "tables": tables,
- "images": images,
- "graphics": graphics,
- "text": page_output,
- "words": words,
+ "tables": parms.tables,
+ "images": parms.images,
+ "graphics": parms.graphics,
+ "text": parms.md_string,
+ "words": parms.words,
}
)
+ del parms
return document_output
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
index 9633f207..2081dedc 100644
--- a/pymupdf4llm/setup.py
+++ b/pymupdf4llm/setup.py
@@ -13,11 +13,11 @@
"Programming Language :: Python :: 3",
"Topic :: Utilities",
]
-requires = ["pymupdf>=1.24.3"]
+requires = ["pymupdf>=1.24.10"]
setuptools.setup(
name="pymupdf4llm",
- version="0.0.16",
+ version="0.0.17",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",