diff --git a/README.md b/README.md index 39a1a32d..28d1b1c9 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ There's a hosted API for marker available [here](https://www.datalab.to/): - Supports PDFs, word documents, and powerpoints - 1/4th the price of leading cloud-based competitors -- High uptime (99.99%), quality, and speed (.25s/page for 50 page doc) +- High uptime (99.99%), quality, and speed (around 15 seconds to convert a 250 page PDF) # Community diff --git a/benchmarks/overall.py b/benchmarks/overall.py index 6564b256..f6fb9591 100644 --- a/benchmarks/overall.py +++ b/benchmarks/overall.py @@ -16,6 +16,8 @@ import subprocess import shutil from tabulate import tabulate + +from marker.settings import settings from scoring import score_text configure_logging() @@ -53,7 +55,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_ md_filename = fname.rsplit(".", 1)[0] + ".md" reference_filename = os.path.join(reference_folder, md_filename) - with open(reference_filename, "r", encoding="utf-8") as f: + with open(reference_filename, "r") as f: reference = f.read() pdf_filename = os.path.join(in_folder, fname) diff --git a/data/images/overall.png b/data/images/overall.png index 0b7f5318..0946421a 100644 Binary files a/data/images/overall.png and b/data/images/overall.png differ diff --git a/data/images/per_doc.png b/data/images/per_doc.png index 6c864a57..ed26cfb9 100644 Binary files a/data/images/per_doc.png and b/data/images/per_doc.png differ diff --git a/marker/builders/document.py b/marker/builders/document.py index d9729beb..60c42749 100644 --- a/marker/builders/document.py +++ b/marker/builders/document.py @@ -33,13 +33,15 @@ def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_bui def build_document(self, provider: PdfProvider): PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) + lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) + highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) initial_pages = [ PageGroupClass( - page_id=i, - lowres_image=provider.get_image(i, self.lowres_image_dpi), - highres_image=provider.get_image(i, self.highres_image_dpi), - polygon=provider.get_page_bbox(i) - ) for i in provider.page_range + page_id=p, + lowres_image=lowres_images[i], + highres_image=highres_images[i], + polygon=provider.get_page_bbox(p) + ) for i, p in enumerate(provider.page_range) ] DocumentClass: Document = get_block_class(BlockTypes.Document) return DocumentClass(filepath=provider.filepath, pages=initial_pages) diff --git a/marker/builders/layout.py b/marker/builders/layout.py index afacf5a4..8a17bdda 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -82,6 +82,10 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou layout_block.polygon = layout_block.polygon.rescale(layout_page_size, provider_page_size) page.add_structure(layout_block) + # Ensure page has non-empty structure + if page.structure is None: + page.structure = [] + def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines): good_pages = [] for document_page in document_pages: diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index 93c5ca19..5a9fb537 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -1,10 +1,10 @@ from typing import List +from ftfy import fix_text from surya.model.detection.model import EfficientViTForSemanticSegmentation from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel from surya.ocr import run_ocr -from marker.settings import settings from marker.builders import BaseBuilder from marker.providers import ProviderOutput, ProviderPageLines from marker.providers.pdf import PdfProvider @@ -14,6 +14,7 @@ from marker.schema.registry import get_block_class from marker.schema.text.line import Line from marker.schema.text.span import Span +from marker.settings import settings class OcrBuilder(BaseBuilder): @@ -96,13 +97,13 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderP ) spans = [ SpanClass( - text=ocr_line.text + "\n", + text=fix_text(ocr_line.text) + "\n", formats=['plain'], page_id=page_id, polygon=polygon, minimum_position=0, maximum_position=0, - font='', + font='Unknown', font_weight=0, font_size=0, ) diff --git a/marker/config/parser.py b/marker/config/parser.py index 11601ac7..7a1ee9f5 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -32,6 +32,8 @@ def common_options(fn): help="Path to JSON file with additional configuration.")(fn) fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn) fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn) + fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn) + fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn) return fn def generate_config_dict(self) -> Dict[str, any]: @@ -61,6 +63,12 @@ def generate_config_dict(self) -> Dict[str, any]: case "disable_multiprocessing": if v: config["pdftext_workers"] = 1 + case "paginate_output": + if v: + config["paginate_output"] = True + case "disable_image_extraction": + if v: + config["extract_images"] = False return config def get_renderer(self): diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 978a8651..b77eb1e8 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -10,6 +10,7 @@ from marker.builders.ocr import OcrBuilder from marker.builders.structure import StructureBuilder from marker.converters import BaseConverter +from marker.processors.blockquote import BlockquoteProcessor from marker.processors.code import CodeProcessor from marker.processors.debug import DebugProcessor from marker.processors.document_toc import DocumentTOCProcessor @@ -17,6 +18,7 @@ from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor +from marker.processors.list import ListProcessor from marker.processors.page_header import PageHeaderProcessor from marker.processors.sectionheader import SectionHeaderProcessor from marker.processors.table import TableProcessor @@ -52,16 +54,18 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No processor_list = strings_to_classes(processor_list) else: processor_list = [ - FootnoteProcessor, - PageHeaderProcessor, - EquationProcessor, - TableProcessor, - SectionHeaderProcessor, - TextProcessor, + BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, + EquationProcessor, + FootnoteProcessor, IgnoreTextProcessor, LineNumbersProcessor, + ListProcessor, + PageHeaderProcessor, + SectionHeaderProcessor, + TableProcessor, + TextProcessor, DebugProcessor, ] diff --git a/marker/output.py b/marker/output.py index e47c861f..ce209afa 100644 --- a/marker/output.py +++ b/marker/output.py @@ -6,6 +6,7 @@ from marker.renderers.html import HTMLOutput from marker.renderers.json import JSONOutput from marker.renderers.markdown import MarkdownOutput +from marker.settings import settings def output_exists(output_dir: str, fname_base: str): @@ -29,11 +30,12 @@ def text_from_rendered(rendered: BaseModel): def save_output(rendered: BaseModel, output_dir: str, fname_base: str): text, ext, images = text_from_rendered(rendered) + text = text.encode(settings.OUTPUT_ENCODING, errors='replace').decode(settings.OUTPUT_ENCODING) - with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f: + with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding=settings.OUTPUT_ENCODING) as f: f.write(text) - with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f: + with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding=settings.OUTPUT_ENCODING) as f: f.write(json.dumps(rendered.metadata, indent=2)) for img_name, img in images.items(): - img.save(os.path.join(output_dir, img_name), "PNG") + img.save(os.path.join(output_dir, img_name), "PNG", optimize=False, compress_level=3) diff --git a/marker/processors/blockquote.py b/marker/processors/blockquote.py new file mode 100644 index 00000000..cc71e3ab --- /dev/null +++ b/marker/processors/blockquote.py @@ -0,0 +1,49 @@ +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + + +class BlockquoteProcessor(BaseProcessor): + """ + A processor for tagging blockquotes + """ + block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) + min_x_indent = 0.05 # % of block width + x_start_tolerance = 0.01 # % of block width + x_end_tolerance = 0.01 # % of block width + + def __init__(self, config): + super().__init__(config) + + def __call__(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + if block.structure is None: + continue + + if not len(block.structure) >= 2: + continue + + next_block = page.get_next_block(block) + if next_block is None: + continue + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: + continue + if next_block.ignore_for_output: + continue + + matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width + matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width + x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width) + y_indent = next_block.polygon.y_start > block.polygon.y_end + + if block.blockquote: + next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent) + next_block.blockquote_level = block.blockquote_level + if (x_indent and y_indent): + next_block.blockquote_level += 1 + elif len(next_block.structure) >= 2 and (x_indent and y_indent): + next_block.blockquote = True + next_block.blockquote_level = 1 diff --git a/marker/processors/debug.py b/marker/processors/debug.py index 05f85b58..3d46b046 100644 --- a/marker/processors/debug.py +++ b/marker/processors/debug.py @@ -69,7 +69,7 @@ def __call__(self, document: Document): print(f"Dumped block debug data to {self.debug_data_folder}") def draw_pdf_debug_images(self, document: Document): - for idx, page in enumerate(document.pages): + for page in document.pages: png_image = page.highres_image.copy() line_bboxes = [] @@ -87,12 +87,12 @@ def draw_pdf_debug_images(self, document: Document): png_image = self.render_layout_boxes(page, png_image) - debug_file = os.path.join(self.debug_folder, f"pdf_page_{idx}.png") + debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png") png_image.save(debug_file) def draw_layout_debug_images(self, document: Document, pdf_mode=False): - for idx, page in enumerate(document.pages): + for page in document.pages: img_size = page.highres_image.size png_image = Image.new("RGB", img_size, color="white") @@ -110,7 +110,7 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False): png_image = self.render_layout_boxes(page, png_image) - debug_file = os.path.join(self.debug_folder, f"layout_page_{idx}.png") + debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png") png_image.save(debug_file) @@ -143,7 +143,7 @@ def render_layout_boxes(self, page, png_image): def dump_block_debug_data(self, document: Document): debug_file = os.path.join(self.debug_folder, f"blocks.json") debug_data = [] - for idx, page in enumerate(document.pages): + for page in document.pages: page_data = page.model_dump(exclude=["lowres_image", "highres_image"]) debug_data.append(page_data) diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 5a990d8c..5da8436d 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -29,7 +29,7 @@ class EquationProcessor(BaseProcessor): """ block_types = (BlockTypes.Equation, ) model_max_length = 384 - batch_size = None + texify_batch_size = None token_buffer = 256 def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None): @@ -68,8 +68,8 @@ def __call__(self, document: Document): block.latex = prediction def get_batch_size(self): - if self.batch_size is not None: - return self.batch_size + if self.texify_batch_size is not None: + return self.texify_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": return 6 elif settings.TORCH_DEVICE_MODEL == "mps": diff --git a/marker/processors/list.py b/marker/processors/list.py new file mode 100644 index 00000000..ff394a4a --- /dev/null +++ b/marker/processors/list.py @@ -0,0 +1,90 @@ +from typing import List + +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.blocks import ListItem +from marker.schema.document import Document + + +class ListProcessor(BaseProcessor): + """ + A processor for merging lists across pages and columns + """ + block_types = (BlockTypes.ListGroup,) + ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) + min_x_indent = 0.01 # % of page width + + def __init__(self, config): + super().__init__(config) + + def __call__(self, document: Document): + self.list_group_continuation(document) + self.list_group_indentation(document) + + def list_group_continuation(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: + continue + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: + continue + if next_block.ignore_for_output: + continue + + column_break, page_break = False, False + next_block_in_first_quadrant = False + + if next_block.page_id == block.page_id: # block on the same page + # we check for a column break + column_break = next_block.polygon.y_start <= block.polygon.y_end + else: + page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) + + block.has_continuation = column_break or (page_break and next_block_in_first_quadrant) + + def list_group_indentation(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + if block.structure is None: + continue + if block.ignore_for_output: + continue + + stack: List[ListItem] = [block.get_next_block(page, None)] + for list_item_id in block.structure: + list_item_block: ListItem = page.get_block(list_item_id) + + while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width): + stack.pop() + + if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start: + list_item_block.list_indent_level = stack[-1].list_indent_level + if list_item_block.polygon.x_start > stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width): + list_item_block.list_indent_level += 1 + + next_list_item_block = block.get_next_block(page, list_item_block) + if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end: + stack = [next_list_item_block] # reset stack on column breaks + else: + stack.append(list_item_block) + + stack: List[ListItem] = [block.get_next_block(page, None)] + for list_item_id in block.structure.copy(): + list_item_block: ListItem = page.get_block(list_item_id) + + while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level: + stack.pop() + + if stack: + current_parent = stack[-1] + current_parent.add_structure(list_item_block) + current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon]) + + block.remove_structure_items([list_item_id]) + stack.append(list_item_block) diff --git a/marker/processors/page_header.py b/marker/processors/page_header.py index 0972084a..989f9d50 100644 --- a/marker/processors/page_header.py +++ b/marker/processors/page_header.py @@ -8,7 +8,7 @@ class PageHeaderProcessor(BaseProcessor): """ A processor for moving PageHeaders to the top """ - block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) + block_types = (BlockTypes.PageHeader) def __call__(self, document: Document): for page in document.pages: diff --git a/marker/processors/table.py b/marker/processors/table.py index 163a7465..853f1205 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -1,4 +1,5 @@ +from ftfy import fix_text from surya.input.pdflines import get_page_text_lines from surya.model.detection.model import EfficientViTForSemanticSegmentation from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel @@ -100,6 +101,8 @@ def __call__(self, document: Document): for table_d, table_res in zip(table_data, tables): block = document.get_block(table_d["block_id"]) cells = assign_rows_columns(table_res, table_d["img_size"]) + for cell in cells: + cell.text = fix_text(cell.text) block.cells = cells def get_detector_batch_size(self): diff --git a/marker/processors/text.py b/marker/processors/text.py index 5294659e..e13d699b 100644 --- a/marker/processors/text.py +++ b/marker/processors/text.py @@ -19,6 +19,7 @@ class TextProcessor(BaseProcessor): Default is 0.02. """ block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) + ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width def __init__(self, config): @@ -32,64 +33,42 @@ def __call__(self, document: Document): if not len(block.structure) >= 2: # Skip single lines continue - + + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: # we've reached the end of the document + continue + if next_block.block_type not in self.block_types: + continue # we found a non-text block + if next_block.structure is None: + continue # This is odd though, why do we have text blocks with no structure? + if next_block.ignore_for_output: + continue # skip ignored blocks + column_gap = block.polygon.width * self.column_gap_ratio column_break, page_break = False, False - next_block = page.get_next_block(block) + next_block_starts_indented = True + next_block_in_first_quadrant = False + last_line_is_full_width = False + last_line_is_hyphentated = False + new_block_lines = [] - if next_block is not None: # next block exists + if next_block.page_id == block.page_id: # block on the same page # we check for a column break column_break = ( - math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and + math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and next_block.polygon.x_start > (block.polygon.x_end + column_gap) ) - else: # It's a page break since we don't have a next block in the page + else: page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) if not (column_break or page_break): continue - - next_block_starts_indented = True - next_block_in_first_quadrant = False - last_line_is_full_width = False - last_line_is_hyphentated = False - new_block_lines = [] - - if column_break: - if next_block.block_type not in self.block_types: - continue - if next_block.structure is None: # This is odd though, why do we have text blocks with no structure? - continue - - new_block_lines = next_block.structure_blocks(document) - else: # page break - next_page = document.get_next_page(page) - if next_page is None: - continue # we're on the last page, so we don't worry about merging - - # Go through the next page only - for next_page_block_id in next_page.structure: - if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]: - continue # skip headers and footers - - # we have our block - next_page_block = next_page.get_block(next_page_block_id) - if next_page_block.ignore_for_output: - continue # skip ignored blocks - - if not (next_page_block.structure is not None and \ - next_page_block.block_type in self.block_types): - # we found a non-text block or an empty text block, so we can stop looking - break - - new_block_lines = next_page_block.structure_blocks(document) - - next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \ - (next_page_block.polygon.y_start < next_page.polygon.height // 2) - break - else: - continue # we didn't break anywhere so we continue + + new_block_lines = next_block.structure_blocks(document) # we check for next_block indentation if len(new_block_lines): diff --git a/marker/providers/__init__.py b/marker/providers/__init__.py index 02007853..6b389065 100644 --- a/marker/providers/__init__.py +++ b/marker/providers/__init__.py @@ -1,5 +1,6 @@ from typing import List, Optional, Dict +from PIL import Image from pydantic import BaseModel from marker.schema.text import Span @@ -11,6 +12,10 @@ class ProviderOutput(BaseModel): line: Line spans: List[Span] + @property + def raw_text(self): + return "".join(span.text for span in self.spans) + ProviderPageLines = Dict[int, List[ProviderOutput]] class BaseProvider: @@ -21,7 +26,7 @@ def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None): def __len__(self): pass - def get_image(self, idx: int, dpi: int): + def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: pass def get_page_bbox(self, idx: int) -> List[float]: diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index f5fbbe79..30b3f835 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -1,6 +1,10 @@ import atexit import re +from concurrent.futures import ThreadPoolExecutor +from concurrent.futures.process import ProcessPoolExecutor +from itertools import repeat from typing import List, Set +import multiprocessing as mp import pypdfium2 as pdfium from ftfy import fix_text @@ -195,12 +199,17 @@ def detect_bad_ocr(self, text): return False - def get_image(self, idx: int, dpi: int) -> Image.Image: - page = self.doc[idx] + @staticmethod + def _render_image(pdf: pdfium.PdfDocument, idx: int, dpi: int) -> Image.Image: + page = pdf[idx] image = page.render(scale=dpi / 72, draw_annots=False).to_pil() image = image.convert("RGB") return image + def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: + images = [self._render_image(self.doc, idx, dpi) for idx in idxs] + return images + def get_page_bbox(self, idx: int) -> PolygonBox | None: bbox = self.page_bboxes.get(idx) if bbox: diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py index d2358188..c8bf79da 100644 --- a/marker/renderers/__init__.py +++ b/marker/renderers/__init__.py @@ -8,13 +8,16 @@ from pydantic import BaseModel from marker.schema import BlockTypes -from marker.schema.blocks.base import BlockOutput, BlockId +from marker.schema.blocks.base import BlockId, BlockOutput +from marker.schema.document import Document +from marker.settings import settings from marker.util import assign_config class BaseRenderer: remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] + extract_images: bool = True def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) @@ -24,7 +27,7 @@ def __call__(self, document): raise NotImplementedError @staticmethod - def extract_image(document, image_id, to_base64=False): + def extract_image(document: Document, image_id, to_base64=False): image_block = document.get_block(image_id) page = document.get_page(image_block.page_id) page_img = page.highres_image @@ -33,7 +36,7 @@ def extract_image(document, image_id, to_base64=False): if to_base64: image_buffer = io.BytesIO() cropped.save(image_buffer, format='PNG') - cropped = base64.b64encode(image_buffer.getvalue()).decode('utf-8') + cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING) return cropped @staticmethod @@ -54,7 +57,7 @@ def replace_whitespace(match): return html - def generate_page_stats(self, document, document_output): + def generate_page_stats(self, document: Document, document_output): page_stats = [] for page in document.pages: block_counts = Counter([str(block.block_type) for block in page.children]).most_common() @@ -65,7 +68,7 @@ def generate_page_stats(self, document, document_output): }) return page_stats - def generate_document_metadata(self, document, document_output): + def generate_document_metadata(self, document: Document, document_output): metadata = { "table_of_contents": document.table_of_contents, "page_stats": self.generate_page_stats(document, document_output), @@ -75,7 +78,7 @@ def generate_document_metadata(self, document, document_output): return metadata - def extract_block_html(self, document, block_output): + def extract_block_html(self, document: Document, block_output: BlockOutput): soup = BeautifulSoup(block_output.html, 'html.parser') content_refs = soup.find_all('content-ref') @@ -91,14 +94,13 @@ def extract_block_html(self, document, block_output): ref_block_id: BlockId = item.id break - if ref_block_id.block_type in self.image_blocks: + if ref_block_id.block_type in self.image_blocks and self.extract_images: images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True) else: images.update(sub_images) ref.replace_with(BeautifulSoup(content, 'html.parser')) - if block_output.id.block_type in self.image_blocks: + if block_output.id.block_type in self.image_blocks and self.extract_images: images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True) return str(soup), images - diff --git a/marker/renderers/html.py b/marker/renderers/html.py index 9d7bfa18..29ca6be3 100644 --- a/marker/renderers/html.py +++ b/marker/renderers/html.py @@ -1,4 +1,5 @@ import re +from typing import Literal from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from pydantic import BaseModel @@ -21,11 +22,12 @@ class HTMLOutput(BaseModel): class HTMLRenderer(BaseRenderer): page_blocks: list = [BlockTypes.Page] paginate_output: bool = False + image_extraction_mode: Literal["lowres", "highres"] = "highres" def extract_image(self, document, image_id): image_block = document.get_block(image_id) page = document.get_page(image_block.page_id) - page_img = page.highres_image + page_img = page.lowres_image if self.image_extraction_mode == "lowres" else page.highres_image image_box = image_block.polygon.rescale(page.polygon.size, page_img.size) cropped = page_img.crop(image_box.bbox) return cropped @@ -49,10 +51,13 @@ def extract_html(self, document, document_output, level=0): if ref_block_id.block_type in self.remove_blocks: ref.replace_with('') elif ref_block_id.block_type in self.image_blocks: - image = self.extract_image(document, ref_block_id) - image_name = f"{ref_block_id.to_path()}.png" - images[image_name] = image - ref.replace_with(BeautifulSoup(f"

", 'html.parser')) + if self.extract_images: + image = self.extract_image(document, ref_block_id) + image_name = f"{ref_block_id.to_path()}.png" + images[image_name] = image + ref.replace_with(BeautifulSoup(f"

", 'html.parser')) + else: + ref.replace_with('') elif ref_block_id.block_type in self.page_blocks: images.update(sub_images) if self.paginate_output: diff --git a/marker/renderers/json.py b/marker/renderers/json.py index 0388717d..de2a16d4 100644 --- a/marker/renderers/json.py +++ b/marker/renderers/json.py @@ -1,12 +1,13 @@ from __future__ import annotations -from typing import List, Dict +from typing import Dict, List from pydantic import BaseModel -from marker.schema.blocks import Block from marker.renderers import BaseRenderer from marker.schema import BlockTypes +from marker.schema.blocks import Block, BlockOutput +from marker.schema.document import Document from marker.schema.registry import get_block_class @@ -37,7 +38,7 @@ class JSONRenderer(BaseRenderer): image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] page_blocks: list = [BlockTypes.Page] - def extract_json(self, document, block_output): + def extract_json(self, document: Document, block_output: BlockOutput): cls = get_block_class(block_output.id.block_type) if cls.__base__ == Block: html, images = self.extract_block_html(document, block_output) @@ -64,7 +65,7 @@ def extract_json(self, document, block_output): section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) ) - def __call__(self, document) -> JSONOutput: + def __call__(self, document: Document) -> JSONOutput: document_output = document.render() json_output = [] for page_output in document_output.children: diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 0e0f2f80..0cadaf16 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -5,8 +5,10 @@ from pydantic import BaseModel from marker.renderers.html import HTMLRenderer +from marker.schema import BlockTypes from marker.schema.document import Document + def cleanup_text(full_text): full_text = re.sub(r'\n{3,}', '\n\n', full_text) full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) @@ -32,9 +34,13 @@ def convert_p(self, el, text, *args): hyphens = r'-—¬' has_continuation = el.has_attr('class') and 'has-continuation' in el['class'] if has_continuation: - if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages - return regex.split(rf"[{hyphens}]\s?$", text)[0] - return f"{text} " + block_type = BlockTypes[el['block-type']] + if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]: + if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages + return regex.split(rf"[{hyphens}]\s?$", text)[0] + return f"{text} " + if block_type == BlockTypes.ListGroup: + return f"{text}" return f"{text}\n\n" if text else "" # default convert_p behavior diff --git a/marker/schema/__init__.py b/marker/schema/__init__.py index 7312c0f1..1957639c 100644 --- a/marker/schema/__init__.py +++ b/marker/schema/__init__.py @@ -1,7 +1,7 @@ -from enum import auto, IntEnum +from enum import auto, StrEnum -class BlockTypes(IntEnum): +class BlockTypes(StrEnum): Line = auto() Span = auto() FigureGroup = auto() diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py index c4a7a41b..fb295031 100644 --- a/marker/schema/blocks/base.py +++ b/marker/schema/blocks/base.py @@ -64,7 +64,7 @@ class Block(BaseModel): page_id: Optional[int] = None text_extraction_method: Optional[Literal['pdftext', 'surya']] = None structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order - ignore_for_output: bool = False # Whether this block should be ignored in output + ignore_for_output: bool = False # Whether this block should be ignored in output source: Literal['layout', 'heuristics', 'processor'] = 'layout' model_config = ConfigDict(arbitrary_types_allowed=True) @@ -87,6 +87,32 @@ def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]: return [] return [document_page.get_block(block_id) for block_id in self.structure] + def get_prev_block(self, document_page: Document | PageGroup, block: Block, ignored_block_types: Optional[List[BlockTypes]] = None): + if ignored_block_types is None: + ignored_block_types = [] + + structure_idx = self.structure.index(block.id) + if structure_idx == 0: + return None + + for prev_block_id in reversed(self.structure[:structure_idx]): + if prev_block_id.block_type not in ignored_block_types: + return document_page.get_block(prev_block_id) + + def get_next_block(self, document_page: Document | PageGroup, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): + if ignored_block_types is None: + ignored_block_types = [] + + structure_idx = 0 + if block is not None: + structure_idx = self.structure.index(block.id) + 1 + + for next_block_id in self.structure[structure_idx:]: + if next_block_id.block_type not in ignored_block_types: + return document_page.get_block(next_block_id) + + return None # No valid next block found + def add_structure(self, block: Block): if self.structure is None: self.structure = [block.id] @@ -170,7 +196,7 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect for block_id in self.structure: block = document.get_block(block_id) rendered = block.render(document, self.structure, section_hierarchy) - section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks + section_hierarchy = rendered.section_hierarchy.copy() # Update the section hierarchy from the peer blocks child_content.append(rendered) return BlockOutput( diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py index 99f46759..1b446ae7 100644 --- a/marker/schema/blocks/inlinemath.py +++ b/marker/schema/blocks/inlinemath.py @@ -5,6 +5,8 @@ class InlineMath(Block): block_type: BlockTypes = BlockTypes.TextInlineMath has_continuation: bool = False + blockquote: bool = False + blockquote_level: int = 0 def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: @@ -13,7 +15,14 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + el_attr = f" block-type='{self.block_type}'" if self.has_continuation: - class_attr = " class='has-continuation'" - return f"{template}

" + el_attr += " class='has-continuation'" + + if self.blockquote: + # Add indentation for blockquote levels + blockquote_prefix = "
" * self.blockquote_level + blockquote_suffix = "
" * self.blockquote_level + return f"{blockquote_prefix}{template}

{blockquote_suffix}" + else: + return f"{template}

" diff --git a/marker/schema/blocks/listitem.py b/marker/schema/blocks/listitem.py index 51ab0839..fef515a4 100644 --- a/marker/schema/blocks/listitem.py +++ b/marker/schema/blocks/listitem.py @@ -12,16 +12,21 @@ def replace_bullets(child_blocks): child_blocks = first_block.children if first_block is not None and first_block.id.block_type == BlockTypes.Line: - bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )" + bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )" first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html) class ListItem(Block): block_type: BlockTypes = BlockTypes.ListItem + list_indent_level: int = 0 def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") # Remove the first bullet character replace_bullets(child_blocks) - return f"
  • {template}
  • " + + el_attr = f" block-type='{self.block_type}'" + if self.list_indent_level: + return f"
      {template}
    " + return f"{template}" diff --git a/marker/schema/blocks/pageheader.py b/marker/schema/blocks/pageheader.py index 4414b1b0..d304490e 100644 --- a/marker/schema/blocks/pageheader.py +++ b/marker/schema/blocks/pageheader.py @@ -3,7 +3,7 @@ class PageHeader(Block): - block_type: str = BlockTypes.PageHeader + block_type: BlockTypes = BlockTypes.PageHeader def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py index 89fba932..4c2dea86 100644 --- a/marker/schema/blocks/text.py +++ b/marker/schema/blocks/text.py @@ -5,6 +5,8 @@ class Text(Block): block_type: BlockTypes = BlockTypes.Text has_continuation: bool = False + blockquote: bool = False + blockquote_level: int = 0 def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: @@ -13,7 +15,13 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + el_attr = f" block-type='{self.block_type}'" if self.has_continuation: - class_attr += " class='has-continuation'" - return f"{template}

    " + el_attr += " class='has-continuation'" + + if self.blockquote: + blockquote_prefix = "
    " * self.blockquote_level + blockquote_suffix = "
    " * self.blockquote_level + return f"{blockquote_prefix}{template}

    {blockquote_suffix}" + else: + return f"{template}

    " diff --git a/marker/schema/document.py b/marker/schema/document.py index 19ca34a4..d7ca4c73 100644 --- a/marker/schema/document.py +++ b/marker/schema/document.py @@ -42,15 +42,23 @@ def get_page(self, page_id): return page return None - def get_next_block(self, block: Block): + def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None): + if ignored_block_types is None: + ignored_block_types = [] + next_block = None + + # Try to find the next block in the current page page = self.get_page(block.page_id) - next_block = page.get_next_block(block) + next_block = page.get_next_block(block, ignored_block_types) if next_block: return next_block - next_page = self.get_next_page(page) - if not next_page: - return None - return next_page.get_block(next_page.structure[0]) + + # If no block found, search subsequent pages + for page in self.pages[self.pages.index(page) + 1:]: + next_block = page.get_next_block(None, ignored_block_types) + if next_block: + return next_block + return None def get_next_page(self, page: PageGroup): page_idx = self.pages.index(page) @@ -85,7 +93,7 @@ def render(self): section_hierarchy = None for page in self.pages: rendered = page.render(self, None, section_hierarchy) - section_hierarchy = rendered.section_hierarchy + section_hierarchy = rendered.section_hierarchy.copy() child_content.append(rendered) return DocumentOutput( diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py index 0149211f..8e8ee3ab 100644 --- a/marker/schema/groups/list.py +++ b/marker/schema/groups/list.py @@ -4,7 +4,12 @@ class ListGroup(Group): block_type: BlockTypes = BlockTypes.ListGroup + has_continuation: bool = False def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) - return f"

      {template}

    " + + el_attr = f" block-type='{self.block_type}'" + if self.has_continuation: + el_attr += " class='has-continuation'" + return f"
      {template}

    " diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 1f216612..c00af1f5 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -1,7 +1,6 @@ from collections import defaultdict -from typing import Dict, List, TYPE_CHECKING, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -import numpy as np from PIL import Image from marker.providers import ProviderOutput @@ -18,7 +17,7 @@ class PageGroup(Group): block_type: BlockTypes = BlockTypes.Page lowres_image: Image.Image | None = None highres_image: Image.Image | None = None - children: List[Block] | None = None + children: List[Union[Any, Block]] | None = None layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong) excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,) maximum_assignment_distance: float = 20 # pixels @@ -35,11 +34,20 @@ def add_child(self, block: Block): else: self.children.append(block) - def get_next_block(self, block: Block): - block_idx = self.structure.index(block.id) - if block_idx + 1 < len(self.structure): - return self.get_block(self.structure[block_idx + 1]) - return None + def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): + if ignored_block_types is None: + ignored_block_types = [] + + structure_idx = 0 + if block is not None: + structure_idx = self.structure.index(block.id) + 1 + + # Iterate over blocks following the given block + for next_block_id in self.structure[structure_idx:]: + if next_block_id.block_type not in ignored_block_types: + return self.get_block(next_block_id) + + return None # No valid next block found def get_prev_block(self, block: Block): block_idx = self.structure.index(block.id) @@ -123,6 +131,11 @@ def identify_missing_blocks( if line_idx in assigned_line_idxs: continue + # if the unassociated line is a new line with minimal area, we can skip it + if provider_outputs[line_idx].line.polygon.area <= 1 and \ + provider_outputs[line_idx].raw_text == "\n": + continue + if new_block is None: new_block = [(line_idx, provider_outputs[line_idx])] elif all([ diff --git a/marker/settings.py b/marker/settings.py index 40a739d1..0a510568 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -14,6 +14,9 @@ class Settings(BaseSettings): FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") + # General + OUTPUT_ENCODING: str = "utf-8" + # General models TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU diff --git a/marker_app.py b/marker_app.py index 32579ee9..208ddbe8 100644 --- a/marker_app.py +++ b/marker_app.py @@ -25,8 +25,7 @@ def load_models(): return create_model_dict() -def convert_pdf(fname: str, **kwargs) -> (str, Dict[str, Any], dict): - config_parser = ConfigParser(kwargs) +def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict): config_dict = config_parser.generate_config_dict() config_dict["pdftext_workers"] = 1 converter = PdfConverter( @@ -122,18 +121,24 @@ def page_count(pdf_file): st.stop() # Run Marker -with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf: +with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf: temp_pdf.write(in_file.getvalue()) temp_pdf.seek(0) filename = temp_pdf.name + cli_options = { + "output_format": output_format, + "page_range": page_range, + "force_ocr": force_ocr, + "debug": debug, + "output_dir": settings.DEBUG_DATA_FOLDER if debug else None, + } + config_parser = ConfigParser(cli_options) rendered = convert_pdf( filename, - page_range=page_range, - force_ocr=force_ocr, - output_format=output_format, - output_dir=settings.DEBUG_DATA_FOLDER if debug else None, - debug=debug + config_parser ) + page_range = config_parser.generate_config_dict()["page_range"] + first_page = page_range[0] if page_range else 0 text, ext, images = text_from_rendered(rendered) with col2: @@ -149,10 +154,10 @@ def page_count(pdf_file): with col1: debug_data_path = rendered.metadata.get("debug_data_path") if debug_data_path: - pdf_image_path = os.path.join(debug_data_path, f"pdf_page_0.png") + pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png") img = Image.open(pdf_image_path) st.image(img, caption="PDF debug image", use_container_width=True) - layout_image_path = os.path.join(debug_data_path, f"layout_page_0.png") + layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png") img = Image.open(layout_image_path) st.image(img, caption="Layout debug image", use_container_width=True) diff --git a/marker_server.py b/marker_server.py index aa5a3178..2f8bbe21 100644 --- a/marker_server.py +++ b/marker_server.py @@ -18,6 +18,7 @@ from fastapi import FastAPI, Form, File, UploadFile from marker.converters.pdf import PdfConverter from marker.models import create_model_dict +from marker.settings import settings app_data = {} @@ -110,7 +111,7 @@ async def _convert_pdf(params: CommonParams): for k, v in images.items(): byte_stream = io.BytesIO() v.save(byte_stream, format="PNG") - encoded[k] = base64.b64encode(byte_stream.getvalue()).decode("utf-8") + encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING) return { "format": params.output_format, @@ -140,7 +141,7 @@ async def convert_pdf_upload( ), ): upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename) - with open(upload_path, "wb") as upload_file: + with open(upload_path, "wb+") as upload_file: file_contents = await file.read() upload_file.write(file_contents) diff --git a/pyproject.toml b/pyproject.toml index 61bdc89f..830d0d7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.0.0" +version = "1.0.1" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" diff --git a/tests/builders/test_rotated_bboxes.py b/tests/builders/test_rotated_bboxes.py index 90632551..d62d0438 100644 --- a/tests/builders/test_rotated_bboxes.py +++ b/tests/builders/test_rotated_bboxes.py @@ -11,7 +11,7 @@ def test_rotated_bboxes(pdf_document): # Ensure we match all text lines up properly text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,)) - assert len(text_lines) == 97 + assert len(text_lines) == 95 # Ensure the bbox sizes match up max_line_position = max([line.polygon.x_end for line in text_lines]) diff --git a/tests/providers/test_pdf_provider.py b/tests/providers/test_pdf_provider.py index a399b06e..40b96b90 100644 --- a/tests/providers/test_pdf_provider.py +++ b/tests/providers/test_pdf_provider.py @@ -4,8 +4,8 @@ @pytest.mark.config({"page_range": [0]}) def test_pdf_provider(pdf_provider): assert len(pdf_provider) == 12 - assert pdf_provider.get_image(0, 72).size == (612, 792) - assert pdf_provider.get_image(0, 96).size == (816, 1056) + assert pdf_provider.get_images([0], 72)[0].size == (612, 792) + assert pdf_provider.get_images([0], 96)[0].size == (816, 1056) page_lines = pdf_provider.get_page_lines(0) assert len(page_lines) == 93 diff --git a/tests/renderers/test_markdown_renderer.py b/tests/renderers/test_markdown_renderer.py index 8f82de4c..0752b100 100644 --- a/tests/renderers/test_markdown_renderer.py +++ b/tests/renderers/test_markdown_renderer.py @@ -25,4 +25,13 @@ def test_markdown_renderer_pagination(pdf_document): def test_markdown_renderer_metadata(pdf_document): renderer = MarkdownRenderer({"paginate_output": True}) metadata = renderer(pdf_document).metadata - assert "table_of_contents" in metadata \ No newline at end of file + assert "table_of_contents" in metadata + + +@pytest.mark.config({"page_range": [0, 1]}) +def test_markdown_renderer_images(pdf_document): + renderer = MarkdownRenderer({"extract_images": False}) + markdown_output = renderer(pdf_document) + + assert len(markdown_output.images) == 0 + assert '![](' not in markdown_output.markdown