diff --git a/CHANGES.md b/CHANGES.md index 0bf11524..a09699aa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,9 +1,17 @@ # Change Log -## Changes in version 0.0.15 +## Changes in version 0.0.17 + +### Fixes: +* [147](https://github.com/pymupdf/RAG/issues/147) - Error when page contains nothing but a table. +* [81](https://github.com/pymupdf/RAG/issues/81) - Issues with bullet points in PDFs. +* [78](https://github.com/pymupdf/RAG/issues/78) - multi column pdf file text extraction. + + +## Changes in version 0.0.15 ### Fixes: diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 690ef694..6861f438 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.16" +__version__ = "0.0.17" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 49045389..6be39bf1 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -33,26 +33,30 @@ from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white from pymupdf4llm.helpers.multi_column import column_boxes from pymupdf4llm.helpers.progress import ProgressBar +from dataclasses import dataclass +# Characters recognized as bullets when starting a line. bullet = tuple( - [ - "- ", - "* ", - chr(0xF0A7), - chr(0xF0B7), - chr(0xB7), - chr(0xBE), - chr(0xBE) + " ", - chr(8226), - ] - + list(map(chr, range(9642, 9680))) + ["- ", "* ", "> ", chr(0xB6), chr(0xB7), chr(8226), chr(0xF0A7), chr(0xF0B7)] + + list(map(chr, range(9632, 9680))) ) GRAPHICS_TEXT = "\n![](%s)\n" class IdentifyHeaders: - """Compute data for identifying header text.""" + """Compute data for identifying header text. + + All non-white text from all selected pages is extracted and its font size + noted as a rounded value. + The most frequent font size (and all smaller ones) is taken as body text + font size. + Larger font sizes are mapped to strings of multiples of '#', the header + tag in Markdown, which in turn is Markdown's representation of HTML's + header tags

to

. + Larger font sizes than body text but smaller than the
font size are + represented as
. + """ def __init__( self, @@ -126,18 +130,28 @@ def get_header_id(self, span: dict, page=None) -> str: markdown header prefix string of 0 to n concatenated '#' characters. """ fontsize = round(span["size"]) # compute fontsize + if fontsize <= self.body_limit: # shortcut for body text + return "" hdr_id = self.header_id.get(fontsize, "") + # If no header but larger than body text, assign
. if not hdr_id and fontsize > self.body_limit: hdr_id = "###### " return hdr_id -def poly_area(points): +# store relevant parameters here +@dataclass +class Parameters: + pass + + +def poly_area(points: list) -> float: """Compute the area of the polygon represented by the given points. We are using the "shoelace" algorithm (Gauss) for this. + Accepts a list of Point items and returns a float. """ - # make a local copy of points (avoid changing the original) + # make a local copy of points (do not change the original) pts = points[:] # remove duplicated connector points first for i in range(len(pts) - 1, 0, -1): @@ -153,7 +167,12 @@ def poly_area(points): def refine_boxes(boxes): - """Join any rectangles with a pairwise non-empty overlap.""" + """Join any rectangles with a pairwise non-empty overlap. + + Accepts and returns a list of Rect items. + Note that rectangles that only "touch" each other (common point or edge) + are not considered as overlapping. + """ new_rects = [] # list of all vector graphic rectangles prects = boxes[:] @@ -167,8 +186,9 @@ def refine_boxes(boxes): if r.intersects(prects[i]): # enlarge first rect with this r |= prects[i] del prects[i] # delete this rect - repeat = True # indicate we must try again + repeat = True # indicate must try again + # first rect now includes all overlaps new_rects.append(r) del prects[0] @@ -179,9 +199,10 @@ def refine_boxes(boxes): def is_significant(box, paths): """Check whether the rectangle "box" contains 'signifiant' drawings. - For this to be true, at least one path must cover an area, - which is smaller than 90% of box. Otherwise we assume - that the graphic is decoration (highlighting, border-only etc.). + 'Significant' means that at least one stroked path must cover an area + less than 90% of box. + Not significant means that the graphic is decoration only (highlighting, + border-only etc.). It will not be considered further. """ box_area = abs(box) * 0.9 # 90% of area of box @@ -264,8 +285,8 @@ def to_markdown( if embed_images is True: write_images = False image_path = "" - if not 0 < image_size_limit < 1: - raise ValueError("'image_size_limit' must be positive and less than 1.") + if not 0 <= image_size_limit < 1: + raise ValueError("'image_size_limit' must be non-negative and less than 1.") DPI = dpi IGNORE_CODE = ignore_code IMG_EXTENSION = image_format @@ -372,15 +393,12 @@ def save_image(page, rect, i): return "" def write_text( - page: pymupdf.Page, - textpage: pymupdf.TextPage, + parms, clip: pymupdf.Rect, tabs=None, - tab_rects: dict = None, - img_rects: dict = None, - links: list = None, + tab_rects: dict = {}, + img_rects: list = [], force_text=force_text, - line_rects=None, ) -> string: """Output the text found inside the given clip. @@ -391,7 +409,7 @@ def write_text( There is also some effort for list supported (ordered / unordered) in that typical characters are replaced by respective markdown characters. - 'tab_rects'/'img_rects' are dictionaries of table, respectively image + 'tab_rects'/'img_rects' are dict / list of table, respectively image or vector graphic rectangles. General Markdown text generation skips these areas. Tables are written via their own 'to_markdown' method. Images and vector graphics are @@ -399,17 +417,20 @@ def write_text( """ if clip is None: - clip = textpage.rect + clip = parms.clip out_string = "" - # This is a list of tuples (linerect, spanlist) - nlines = get_raw_lines(textpage, clip=clip, tolerance=3) + tab_rects0 = list(tab_rects.values()) + img_rects0 = img_rects - line_rects.extend([l[0] for l in nlines]) # store line rectangles + # This is a list of tuples (linerect, spanlist) + nlines = [ + l + for l in get_raw_lines(parms.textpage, clip=clip, tolerance=3) + if not intersects_rects(l[0], tab_rects0) + ] - tab_rects0 = list(tab_rects.values()) - img_rects0 = list(img_rects.values()) - line_rects.extend( + parms.line_rects.extend( [l[0] for l in nlines if not intersects_rects(l[0], tab_rects0)] ) # store line rectangles @@ -420,9 +441,7 @@ def write_text( for lrect, spans in nlines: # there may be tables or images inside the text block: skip them - if intersects_rects(lrect, tab_rects0) or intersects_rects( - lrect, img_rects0 - ): + if intersects_rects(lrect, img_rects0): continue # ------------------------------------------------------------ @@ -436,55 +455,50 @@ def write_text( ], key=lambda j: (j[1].y1, j[1].x0), ): - out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n" + out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( set( [ pymupdf.Rect(c) - for c in tabs[i].header.cells + tabs[i].cells + for c in parms.tabs[i].header.cells + + parms.tabs[i].cells if c is not None ] ), key=lambda c: (c.y1, c.x0), ) - line_rects.extend(cells) + parms.line_rects.extend(cells) del tab_rects[i] # ------------------------------------------------------------ # Pick up images / graphics ABOVE this text block # ------------------------------------------------------------ for i, temp_rect in sorted( - [ - j - for j in img_rects.items() - if j[1].y1 <= lrect.y0 and not (j[1] & clip).is_empty - ], + [j for j in img_rects if j.y1 <= lrect.y0 and not (j & clip).is_empty], key=lambda j: (j[1].y1, j[1].x0), ): - pathname = save_image(page, temp_rect, i) + pathname = save_image(parms.page, temp_rect, i) if pathname: out_string += GRAPHICS_TEXT % pathname # recursive invocation if force_text: img_txt = write_text( - page, - textpage, - clip=temp_rect, + parms, + temp_rect, tabs=None, tab_rects={}, - img_rects={}, - links=links, + img_rects=[], force_text=True, - line_rects=line_rects, ) if not is_white(img_txt): out_string += img_txt del img_rects[i] + parms.line_rects.append(lrect) text = " ".join([s["text"] for s in spans]) # full line mono-spaced? @@ -522,7 +536,7 @@ def write_text( prev_lrect = lrect # if line is a header, this will return multiple "#" characters - hdr_string = get_header_id(span0, page=page) + hdr_string = get_header_id(span0, page=parms.page) # intercept if header text has been broken in multiple lines if hdr_string and hdr_string == prev_hdr_string: @@ -560,14 +574,23 @@ def write_text( suffix = "_" + suffix # convert intersecting link into markdown syntax - ltext = resolve_links(links, s) + ltext = resolve_links(parms.links, s) if ltext: text = f"{hdr_string}{prefix}{ltext}{suffix} " else: text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " - if text.startswith(bullet): - text = "- " + text[1:] + text = text[1:] + if len(text) > 1 and text[1] == " ": + t = "-" + else: + t = "- " + text = t + text[1:] + dist = span0["bbox"][0] - clip.x0 + cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len( + span0["text"] + ) + text = " " * int(round(dist / cwidth)) + text out_string += text if not code: out_string += "\n" @@ -595,105 +618,96 @@ def intersects_rects(rect, rect_list): return i return 0 - def output_tables(tabs, text_rect, tab_rects, line_rects, textpage): - """Output tables above a text rectangle.""" - this_md = "" # markdown string for table content + def output_tables(parms, text_rect): + """Output tables above given text rectangle.""" + this_md = "" # markdown string for table(s) content if text_rect is not None: # select tables above the text block for i, trect in sorted( - [j for j in tab_rects.items() if j[1].y1 <= text_rect.y0], + [j for j in parms.tab_rects.items() if j[1].y1 <= text_rect.y0], key=lambda j: (j[1].y1, j[1].x0), ): - this_md += tabs[i].to_markdown(clean=False) + this_md += parms.tabs[i].to_markdown(clean=False) if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( set( [ pymupdf.Rect(c) - for c in tabs[i].header.cells + tabs[i].cells + for c in parms.tabs[i].header.cells + + parms.tabs[i].cells if c is not None ] ), key=lambda c: (c.y1, c.x0), ) - line_rects.extend(cells) - del tab_rects[i] # do not touch this table twice + parms.line_rects.extend(cells) + del parms.tab_rects[i] # do not touch this table twice else: # output all remaining tables for i, trect in sorted( - tab_rects.items(), + parms.tab_rects.items(), key=lambda j: (j[1].y1, j[1].x0), ): - this_md += tabs[i].to_markdown(clean=False) + this_md += parms.tabs[i].to_markdown(clean=False) if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( set( [ pymupdf.Rect(c) - for c in tabs[i].header.cells + tabs[i].cells + for c in parms.tabs[i].header.cells + + parms.tabs[i].cells if c is not None ] ), key=lambda c: (c.y1, c.x0), ) - line_rects.extend(cells) - del tab_rects[i] # do not touch this table twice + parms.line_rects.extend(cells) + del parms.tab_rects[i] # do not touch this table twice return this_md - def output_images(page, textpage, text_rect, img_rects, line_rects): + def output_images(parms, text_rect): """Output images and graphics above text rectangle.""" - if img_rects is None: + if not parms.img_rects: return "" this_md = "" # markdown string if text_rect is not None: # select images above the text block - for i, img_rect in sorted( - [j for j in img_rects.items() if j[1].y1 <= text_rect.y0], - key=lambda j: (j[1].y1, j[1].x0), - ): - pathname = save_image(page, img_rect, i) + for i, img_rect in enumerate(parms.img_rects): + if not img_rect.y1 <= text_rect.y0: + continue + pathname = save_image(parms.page, img_rect, i) if pathname: this_md += GRAPHICS_TEXT % pathname if force_text: img_txt = write_text( - page, - textpage, - clip=img_rect, + parms, + img_rect, tabs=None, tab_rects={}, # we have no tables here - img_rects={}, # we have no other images here - links=[], # rely on explicit HTML syntax + img_rects=[], # we have no other images here force_text=True, - line_rects=line_rects, ) if not is_white(img_txt): # was there text at all? this_md += img_txt - - del img_rects[i] # do not touch this image twice + del parms.img_rects[i] # do not touch this image twice else: # output all remaining images - for i, img_rect in sorted( - img_rects.items(), - key=lambda j: (j[1].y1, j[1].x0), - ): - pathname = save_image(page, img_rect, i) + for i, img_rect in enumerate(parms.img_rects): + pathname = save_image(parms.page, img_rect, i) if pathname: this_md += GRAPHICS_TEXT % pathname if force_text: img_txt = write_text( - page, - textpage, - clip=img_rect, + parms, + img_rect, tabs=None, tab_rects={}, # we have no tables here - img_rects={}, # we have no other images here - links=[], # rely on explicit HTML syntax + img_rects=[], # we have no other images here force_text=True, - line_rects=line_rects, ) if not is_white(img_txt): this_md += img_txt - del img_rects[i] # do not touch this image twice + del parms.img_rects[i] # do not touch this image twice return this_md def get_metadata(doc, pno): @@ -703,7 +717,16 @@ def get_metadata(doc, pno): meta["page"] = pno + 1 return meta - def sort_words(words): + def sort_words(words: list) -> list: + """Reorder words in lines. + + The argument list must be presorted by bottom, then left coordinates. + + Words with similar top / bottom coordinates are assumed to belong to + the same line and will be sorted left to right within that line. + """ + if not words: + return [] nwords = [] line = [words[0]] lrect = pymupdf.Rect(words[0][:4]) @@ -734,74 +757,86 @@ def get_page_output(doc, pno, margins, textflags): """ page = doc[pno] page.remove_rotation() # make sure we work on rotation=0 - md_string = "" + parms = Parameters() # all page information + parms.page = page + parms.md_string = "" + parms.images = [] + parms.tables = [] + parms.graphics = [] + parms.words = [] + parms.line_rects = [] + + # catch too-many-graphics situation if GRAPHICS_LIMIT is not None: - test_paths = page.get_cdrawings() + test_paths = page.get_cdrawings() # fastest access to graphics if (excess := len(test_paths)) > GRAPHICS_LIMIT: - md_string = ( + parms.md_string = ( f"\n**Ignoring page {page.number} with {excess}+ vector graphics.**" ) - md_string += "\n\n-----\n\n" - return md_string, [], [], [] + parms.md_string += "\n\n-----\n\n" + return parms left, top, right, bottom = margins - clip = page.rect + (left, top, -right, -bottom) + parms.clip = page.rect + (left, top, -right, -bottom) + # extract external links on page - links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI] + parms.links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI] # make a TextPage for all later extractions - textpage = page.get_textpage(flags=textflags, clip=clip) + parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip) # extract images on page - # ignore images contained in another one (simplified mechanism) - img_info = page.get_image_info()[:] + # ignore images contained in some other one (simplified mechanism) + img_info = page.get_image_info() + for i in range(len(img_info)): + item = img_info[i] + item["bbox"] = pymupdf.Rect(item["bbox"]) & parms.clip + img_info[i] = item + # sort descending by image area size - img_info.sort(key=lambda i: abs(pymupdf.Rect(i["bbox"])), reverse=True) + img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True) # run from back to front (= small to large) for i in range(len(img_info) - 1, 0, -1): - img1 = img_info[i] - img0 = img_info[i - 1] - if ( - pymupdf.Rect(img1["bbox"]) & page.rect - in pymupdf.Rect(img0["bbox"]) & page.rect - ): - del img_info[i] # contained in some larger image - images = img_info - tables = [] - graphics = [] - line_rects = [] + r = img_info[i]["bbox"] + if r.is_empty: + del img_info[i] + continue + for j in range(i): # image areas larger than r + if r in img_info[j]["bbox"]: + del img_info[i] # contained in some larger image + break + parms.images = img_info + parms.img_rects = [i["bbox"] for i in parms.images] # Locate all tables on page - tabs = page.find_tables(clip=clip, strategy=table_strategy) - + parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) # Make a list of table boundary boxes. # Must include the header bbox (which may exist outside tab.bbox) tab_rects = {} - for i, t in enumerate(tabs): + for i, t in enumerate(parms.tabs): tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox) tab_dict = { "bbox": tuple(tab_rects[i]), "rows": t.row_count, "columns": t.col_count, } - tables.append(tab_dict) - + parms.tables.append(tab_dict) + parms.tab_rects = tab_rects # list of table rectangles - tab_rects0 = list(tab_rects.values()) + parms.tab_rects0 = list(tab_rects.values()) # Select paths not contained in any table # ignore full page graphics - page_clip = page.rect + (36, 36, -36, -36) paths = [ p for p in page.get_drawings() - if not intersects_rects(p["rect"], tab_rects0) - and p["rect"] in page_clip - and p["rect"].width < page_clip.width - and p["rect"].height < page_clip.height + if not intersects_rects(p["rect"], parms.tab_rects0) + and p["rect"] in parms.clip + and p["rect"].width < parms.clip.width + and p["rect"].height < parms.clip.height ] - # We also ignore vector graphics that only represent "text - # emphasizing sugar". + # We also ignore vector graphics that only represent + # "text emphasizing sugar". vg_clusters0 = [] # worthwhile vector graphics go here # walk through all vector graphics outside any table @@ -810,74 +845,67 @@ def get_page_output(doc, pno, margins, textflags): vg_clusters0.append(bbox) # remove paths that are not in some relevant graphic - actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)] + parms.actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)] # also add image rectangles to the list - vg_clusters0 += [pymupdf.Rect(i["bbox"]) for i in img_info] + vg_clusters0.extend(parms.img_rects) # these may no longer be pairwise disjoint: # remove area overlaps by joining into larger rects - vg_clusters0 = refine_boxes(vg_clusters0) + parms.vg_clusters0 = refine_boxes(vg_clusters0) - vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0)) + parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0)) # identify text bboxes on page, avoiding tables, images and graphics text_rects = column_boxes( - page, - paths=actual_paths, + parms.page, + paths=parms.actual_paths, no_image_text=True, - textpage=textpage, - avoid=tab_rects0 + vg_clusters0, + textpage=parms.textpage, + avoid=parms.tab_rects0 + parms.vg_clusters0, footer_margin=margins[3], header_margin=margins[1], ) - """Extract markdown text iterating over text rectangles. + """ + ------------------------------------------------------------------ + Extract markdown text iterating over text rectangles. We also output any tables. They may live above, below or inside the text rectangles. + ------------------------------------------------------------------ """ for text_rect in text_rects: - # output tables above this block of text - md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage) - md_string += output_images( - page, textpage, text_rect, vg_clusters, line_rects - ) + # output tables above this rectangle + parms.md_string += output_tables(parms, text_rect) + parms.md_string += output_images(parms, text_rect) # output text inside this rectangle - md_string += write_text( - page, - textpage, - text_rect, - tabs=tabs, - tab_rects=tab_rects, - img_rects=vg_clusters, - links=links, - force_text=force_text, - line_rects=line_rects, - ) + parms.md_string += write_text(parms, text_rect, force_text=force_text) - md_string = md_string.replace(" ,", ",").replace("-\n", "") + parms.md_string = parms.md_string.replace(" ,", ",").replace("-\n", "") # write any remaining tables and images - md_string += output_tables(tabs, None, tab_rects, line_rects, textpage) - md_string += output_images(page, textpage, None, vg_clusters, line_rects) - md_string += "\n-----\n\n" - while md_string.startswith("\n"): - md_string = md_string[1:] - md_string = md_string.replace(chr(0), chr(0xFFFD)) + + parms.md_string += output_tables(parms, None) + + parms.md_string += output_images(parms, None) + + parms.md_string += "\n-----\n\n" + while parms.md_string.startswith("\n"): + parms.md_string = parms.md_string[1:] + parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD)) if EXTRACT_WORDS is True: # output words in sequence compliant with Markdown text - rawwords = textpage.extractWORDS() + rawwords = parms.textpage.extractWORDS() rawwords.sort(key=lambda w: (w[3], w[0])) + words = [] - for lrect in line_rects: + for lrect in parms.line_rects: lwords = [] for w in rawwords: wrect = pymupdf.Rect(w[:4]) if wrect in lrect: lwords.append(w) - # append sorted words of this line - # words.extend(sorted(lwords, key=lambda w: w[0])) words.extend(sort_words(lwords)) # remove word duplicates without spoiling the sequence @@ -890,7 +918,8 @@ def get_page_output(doc, pno, margins, textflags): else: words = [] - return md_string, images, tables, graphics, words + parms.words = words + return parms if page_chunks is False: document_output = "" @@ -904,11 +933,9 @@ def get_page_output(doc, pno, margins, textflags): print(f"Processing {doc.name}...") pages = ProgressBar(pages) for pno in pages: - page_output, images, tables, graphics, words = get_page_output( - doc, pno, margins, textflags - ) + parms = get_page_output(doc, pno, margins, textflags) if page_chunks is False: - document_output += page_output + document_output += parms.md_string else: # build subet of TOC for this page page_tocs = [t for t in toc if t[-1] == pno + 1] @@ -918,13 +945,14 @@ def get_page_output(doc, pno, margins, textflags): { "metadata": metadata, "toc_items": page_tocs, - "tables": tables, - "images": images, - "graphics": graphics, - "text": page_output, - "words": words, + "tables": parms.tables, + "images": parms.images, + "graphics": parms.graphics, + "text": parms.md_string, + "words": parms.words, } ) + del parms return document_output diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 9633f207..2081dedc 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf>=1.24.3"] +requires = ["pymupdf>=1.24.10"] setuptools.setup( name="pymupdf4llm", - version="0.0.16", + version="0.0.17", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",