From 26f389070165348c8f734570ca3ed9862d81bc4e Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 17 Jun 2024 09:51:27 -0700 Subject: [PATCH] Bugfixes and new features --- README.md | 17 +++++++++++++---- convert.py | 24 ++++++++++++++++-------- marker/images/extract.py | 5 +++++ marker/postprocessors/markdown.py | 7 ++++++- marker/settings.py | 1 + pyproject.toml | 2 +- 6 files changed, 42 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 33b7af5c..bdf36906 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,16 @@ The above results are with marker and nougat setup so they each take ~4GB of VRA See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. +# Hosted API + +There is a hosted API for marker available [here](https://www.datalab.to/). It has been tuned for performance, and generally takes 10s + 1s/page for conversion. + # Commercial usage I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage. The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to). -# Hosted API - -There is a hosted API for marker available [here](https://www.datalab.to/). It's currently in beta, and I'm working on optimizing speed. - # Community [Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development. @@ -147,6 +147,15 @@ There are some settings that you may find useful if things aren't working the wa In general, if output is not what you expect, trying to OCR the PDF is a good first step. Not all PDFs have good text/bboxes embedded in them. +## Useful settings + +These settings can improve/change output quality: + +- `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad text embedded due to older OCR engines being used. +- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False. +- `EXTRACT_IMAGES` will extract images and save separately. Default: True. +- `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output. + # Benchmarks Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct. diff --git a/convert.py b/convert.py index 615ea62d..6d375ef4 100755 --- a/convert.py +++ b/convert.py @@ -23,6 +23,9 @@ def worker_init(shared_model): + if shared_model is None: + shared_model = load_all_models() + global model_refs model_refs = shared_model @@ -105,17 +108,22 @@ def main(): tasks_per_gpu = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0 total_processes = min(tasks_per_gpu, total_processes) - mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work - model_lst = load_all_models() + try: + mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work + except RuntimeError: + raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.") - for model in model_lst: - if model is None: - continue + if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps": + print("Cannot use MPS with torch multiprocessing share_memory. This will make things less memory efficient. If you want to share memory, you have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.") - if model.device.type == "mps": - raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.") + model_lst = None + else: + model_lst = load_all_models() - model.share_memory() + for model in model_lst: + if model is None: + continue + model.share_memory() print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}") task_args = [(f, out_folder, metadata.get(os.path.basename(f)), args.min_length) for f in files_to_convert] diff --git a/marker/images/extract.py b/marker/images/extract.py index 80fd4f8e..3870873a 100644 --- a/marker/images/extract.py +++ b/marker/images/extract.py @@ -39,6 +39,11 @@ def extract_page_images(page_obj, page): image_blocks = find_image_blocks(page) for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks): + if block_idx >= len(page.blocks): + block_idx = len(page.blocks) - 1 + if block_idx < 0: + continue + block = page.blocks[block_idx] image = render_bbox_image(page_obj, page, bbox) image_filename = get_image_filename(page, image_idx) diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index ea39b23e..005f2b4b 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -4,6 +4,8 @@ import regex from typing import List +from marker.settings import settings + def escape_markdown(text): # List of characters that need to be escaped in markdown @@ -143,7 +145,7 @@ def merge_lines(blocks: List[List[MergedBlock]]): block_text = "" block_type = "" - for page in blocks: + for idx, page in enumerate(blocks): for block in page: block_type = block.block_type if block_type != prev_type and prev_type: @@ -168,6 +170,9 @@ def merge_lines(blocks: List[List[MergedBlock]]): else: block_text = line.text + if settings.PAGINATE_OUTPUT and idx < len(blocks) - 1: + block_text += "\n\n" + "-" * 16 + "\n\n" # Page separator horizontal rule + # Append the final block text_blocks.append( FullyMergedBlock( diff --git a/marker/settings.py b/marker/settings.py index 6dd26d09..155e813e 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -11,6 +11,7 @@ class Settings(BaseSettings): TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them + PAGINATE_OUTPUT: bool = False # Paginate output markdown @computed_field @property diff --git a/pyproject.toml b/pyproject.toml index aa1b5aa8..95d31cf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.2.13" +version = "0.2.14" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"