From 86ba8f54446322f36ba3ea964d013e9215197025 Mon Sep 17 00:00:00 2001 From: yzz Date: Fri, 3 Jan 2025 14:19:04 +0800 Subject: [PATCH] feat(web_api): pdf parse API and function implementation - added md_content and content_list result to return in the functions --- magic_pdf/pipe/operators.py | 2 + projects/web_api/app_v0_10_6.py | 48 +++++++++++++ projects/web_api/pdf_parse_main.py | 106 +++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+) create mode 100644 projects/web_api/app_v0_10_6.py create mode 100644 projects/web_api/pdf_parse_main.py diff --git a/magic_pdf/pipe/operators.py b/magic_pdf/pipe/operators.py index f8a6168a..3a806d96 100644 --- a/magic_pdf/pipe/operators.py +++ b/magic_pdf/pipe/operators.py @@ -45,6 +45,7 @@ def dump_md( pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix ) writer.write_string(file_path, md_content) + return md_content def dump_content_list( self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str @@ -66,6 +67,7 @@ def dump_content_list( writer.write_string( file_path, json.dumps(content_list, ensure_ascii=False, indent=4) ) + return content_list def dump_middle_json(self, writer: DataWriter, file_path: str): """Dump the result of pipeline. 
# --- projects/web_api/app_v0_10_6.py ---
import os
from typing import Literal

from fastapi import FastAPI, UploadFile, HTTPException

from pdf_parse_main import pdf_parse_main

app = FastAPI()

parse_allowed_methods = Literal["auto", "txt", "ocr"]

# Here to set the default output path of the parsing result file
PDF_OUTPUT_PATH = "/tmp/output"


@app.post("/pdf-parse")
async def pdf_parse(
    file: UploadFile,
    parse_method: parse_allowed_methods = "auto",
    is_output: bool = False,
    save_path: str = PDF_OUTPUT_PATH,
):
    """Parse an uploaded PDF and return its markdown, content list and text.

    Args:
        file: The uploaded PDF. Its name must end with ``.pdf``
            (case-insensitive), otherwise the request is rejected with 415.
        parse_method: Parsing strategy: ``auto``, ``txt`` or ``ocr``.
        is_output: Whether to keep the parsing result files on disk.
        save_path: Directory under which the result directory is created.

    Returns:
        A dict with the keys ``md_data``, ``content_list_data`` and
        ``txt_data``.

    Raises:
        HTTPException: 415 for a missing or non-PDF file name, 400 when the
            parser raises ``ValueError``, 500 for any other failure.
    """
    # NOTE(review): save_path comes straight from the request, so a client can
    # choose where on the server the results are written. Consider restricting
    # it to a sub-directory of PDF_OUTPUT_PATH.
    # NOTE(review): pdf_parse_main is declared async but contains no awaits, so
    # the parse presumably blocks the event loop while it runs — confirm and
    # consider offloading it to a worker thread/process.

    # The file name is client-controlled and may be absent. Normalise Windows
    # separators so basename() strips any directory part on every platform.
    upload_name = (file.filename or "").replace("\\", "/")
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=415, detail="File type error")

    # Keep only the final path component and drop just the ".pdf" suffix, so
    # "report.v2.pdf" becomes "report.v2" and "/etc/x.pdf" cannot redirect the
    # output directory. Fall back to "noname" when nothing is left.
    pdf_file_name = os.path.basename(upload_name)[: -len(".pdf")] or "noname"

    pdf_bytes = await file.read()

    try:
        md_content, list_content, txt_content = await pdf_parse_main(
            pdf_bytes, pdf_file_name, parse_method, is_output, save_path
        )
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        "md_data": md_content,
        "content_list_data": list_content,
        "txt_data": txt_content,
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8999)


# --- projects/web_api/pdf_parse_main.py ---
import os
from shutil import rmtree
from datetime import datetime

from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
async def pdf_parse_main(
    pdf_file,
    pdf_file_name: str = "noname",
    parse_method: str = "auto",
    is_save_output: bool = False,
    save_path: str = None,
):
    """Parse a PDF and return its markdown, content list and plain text.

    Args:
        pdf_file: Path of an existing PDF file, or the PDF content as bytes.
        pdf_file_name: Base name for the result directory and result files.
            When ``pdf_file`` is a path, the path's base name (without
            extension) is used instead.
        parse_method: ``auto``, ``txt`` or ``ocr``. ``auto`` selects ``ocr``
            when the dataset classifies as OCR and ``txt`` otherwise.
        is_save_output: Keep the result directory when true; otherwise it is
            removed before the function returns or raises.
        save_path: Parent directory of the result directory. The current
            working directory is used when this is empty or None.

    Returns:
        A tuple ``(md_content, list_content, txt_content)``: the value
        returned by ``dump_md``, the value returned by ``dump_content_list``,
        and the ``"text"`` fields of the content-list items joined with
        newlines.

    Raises:
        ValueError: ``pdf_file`` is neither an existing path nor bytes, or
            ``parse_method`` is not one of the supported values.
        Exception: Any failure while reading or parsing; the original error
            is chained as ``__cause__``.
    """
    # Validate arguments before touching the filesystem, and outside the
    # wrapping try-block, so callers can tell bad input (ValueError) apart
    # from processing failures.
    is_path = isinstance(pdf_file, str) and os.path.exists(pdf_file)
    if not is_path and not isinstance(pdf_file, bytes):
        raise ValueError(
            "pdf_file must be a file path or byte data. "
            "Please ensure the path is correct or provide the correct byte data."
        )
    if parse_method not in ("auto", "txt", "ocr"):
        raise ValueError(
            f"Unsupported parsing method: {parse_method}, please choose [auto, txt, ocr]."
        )

    # Imported at call time, after argument validation, so invalid arguments
    # are rejected without loading the dataset backend.
    from magic_pdf.data.dataset import PymuDocDataset

    local_md_dir = None

    try:
        if is_path:
            # A path was given: read its bytes and name the output after it.
            pdf_bytes = FileBasedDataReader().read(pdf_file)
            pdf_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
        else:
            pdf_bytes = pdf_file

        # Create the output directory. The suffix is a timestamp with
        # millisecond resolution (microseconds trimmed to three digits).
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]
        base_dir = save_path if save_path else os.getcwd()
        local_md_dir = os.path.join(base_dir, f"{pdf_file_name}_{timestamp}")
        local_image_dir = os.path.join(local_md_dir, "images")
        os.makedirs(local_image_dir, exist_ok=True)

        md_writer = FileBasedDataWriter(local_md_dir)
        image_writer = FileBasedDataWriter(local_image_dir)

        ds = PymuDocDataset(pdf_bytes)

        if parse_method == "auto":
            parse_method = "ocr" if ds.classify() == SupportedPdfParseMethod.OCR else "txt"

        # parse_method is now either "txt" or "ocr".
        use_ocr = parse_method == "ocr"
        infer_result = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True)
        else:
            pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True)

        # Write the result files and keep the returned content.
        md_content = pipe_result.dump_md(md_writer, f"{pdf_file_name}.md", local_image_dir)
        list_content = pipe_result.dump_content_list(
            md_writer, f"{pdf_file_name}_content_list.json", local_image_dir
        )

        # Plain text: the "text" field of every item, empty when absent.
        txt_content = "\n".join(item.get("text", "") for item in list_content)

        return md_content, list_content, txt_content

    except Exception as e:
        raise Exception(f"An error occurred when processing the file: {e}") from e

    finally:
        if not is_save_output and local_md_dir and os.path.exists(local_md_dir):
            # delete the output directory
            rmtree(local_md_dir)


# test
if __name__ == "__main__":
    import asyncio

    pdf_path = "/home/yzz/pdf_file_test/Quality.pdf"

    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # pdf_parse_main also accepts pdf_path itself in place of the bytes.
    asyncio.run(pdf_parse_main(pdf_bytes, parse_method="auto", is_save_output=True))