feat(web_api): pdf parse API and function implementation

- dump_md and dump_content_list now return md_content and content_list in addition to writing them
opendatalab · Jan 3, 2025 · 86ba8f5 · 86ba8f5
1 parent 81fcef8
commit 86ba8f5
Show file tree

Hide file tree

Showing 3 changed files with 156 additions and 0 deletions.
diff --git a/magic_pdf/pipe/operators.py b/magic_pdf/pipe/operators.py
@@ -45,6 +45,7 @@ def dump_md(
             pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
         )
         writer.write_string(file_path, md_content)
+        return md_content
 
     def dump_content_list(
         self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
@@ -66,6 +67,7 @@ def dump_content_list(
         writer.write_string(
             file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
         )
+        return content_list
 
     def dump_middle_json(self, writer: DataWriter, file_path: str):
         """Dump the result of pipeline.

diff --git a/projects/web_api/app_v0_10_6.py b/projects/web_api/app_v0_10_6.py
@@ -0,0 +1,48 @@
+from typing import Literal
+
+from fastapi import FastAPI, UploadFile, HTTPException
+
+from pdf_parse_main import pdf_parse_main
+
+app = FastAPI()
+
+parse_allowed_methods = Literal["auto", "txt", "ocr"]
+
+# Default output directory for the parsing result files
+PDF_OUTPUT_PATH = "/tmp/output"
+
+
+@app.post("/pdf-parse")
+async def pdf_parse(
+    file: UploadFile,
+    parse_method: parse_allowed_methods = "auto",
+    is_output: bool = False,
+    save_path: str = PDF_OUTPUT_PATH,
+):
+    """
+    Parse an uploaded PDF and return its markdown, content list and plain text.
+
+    file: The uploaded PDF; its name must end with ".pdf" (case-insensitive)
+    parse_method: One of auto/txt/ocr
+    is_output: Whether to keep the parsing result file
+    save_path: Parse result file save path
+
+    Responds with 415 when the upload is not named like a PDF, 400 when
+    pdf_parse_main raises ValueError and 500 for any other failure.
+    """
+    # NOTE(review): save_path is supplied by the client and is handed to
+    # pdf_parse_main unchanged, so the caller decides where result directories
+    # are created -- consider restricting it to a fixed root.
+
+    # The filename may be missing from the upload; treat that as a wrong type
+    # instead of failing on None.
+    upload_name = file.filename or ""
+    if not upload_name.lower().endswith(".pdf"):
+        raise HTTPException(status_code=415, detail="File type error")
+
+    pdf_bytes = await file.read()
+    # Keep only the last path component (either separator style) and strip just
+    # the final extension, so "dir/a.b.pdf" becomes "a.b" rather than "dir/a".
+    base_name = upload_name.replace("\\", "/").rsplit("/", 1)[-1]
+    pdf_file_name = base_name.rsplit(".", 1)[0] or "noname"
+
+    try:
+        md_content, list_content, txt_content = await pdf_parse_main(
+            pdf_bytes, pdf_file_name, parse_method, is_output, save_path
+        )
+
+        return {"md_data": md_content, "content_list_data": list_content, "txt_data": txt_content}
+
+    except ValueError as ve:
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+if __name__ == "__main__":
+    # Run the API directly with uvicorn, listening on every interface.
+    from uvicorn import run as serve
+
+    serve(app, host="0.0.0.0", port=8999)
diff --git a/projects/web_api/pdf_parse_main.py b/projects/web_api/pdf_parse_main.py
@@ -0,0 +1,106 @@
+import os
+from shutil import rmtree
+from datetime import datetime
+
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
+from magic_pdf.data.dataset import PymuDocDataset
+
+
+async def pdf_parse_main(
+        pdf_file,
+        pdf_file_name: str = "noname",
+        parse_method: str = "auto",
+        is_save_output: bool = False,
+        save_path: str = None,
+):
+    """
+    Parse a PDF and return its markdown, content list and plain text.
+
+    Args:
+        pdf_file: file path or file bytes
+        pdf_file_name: Use the file name as the folder name for saving the result file.
+            Replaced by the path's base name when pdf_file is a path.
+        parse_method: auto/txt/ocr
+        is_save_output: Whether to save the output file.
+        save_path: Directory to save the output file. By default, the output file will be saved in the current workspace directory.
+
+    Returns:
+        md_content: markdown result without images
+        list_content: list result
+        txt_content: just text from list_content
+
+    Raises:
+        ValueError: pdf_file is neither an existing path nor bytes, or
+            parse_method is not one of auto/txt/ocr.
+        Exception: any failure while parsing; the original error is chained
+            as the cause.
+    """
+    # Reject bad arguments up front: the caller gets a ValueError it can tell
+    # apart from a parsing failure, and no output directory is created.
+    is_path = isinstance(pdf_file, str) and os.path.exists(pdf_file)
+    if not is_path and not isinstance(pdf_file, bytes):
+        raise ValueError(
+            "pdf_file must be a file path or byte data. "
+            "Please ensure the path is correct or provide the correct byte data."
+        )
+    if parse_method not in ("auto", "txt", "ocr"):
+        raise ValueError(f"Unsupported parsing method: {parse_method}, please choose [auto, txt, ocr].")
+
+    local_md_dir = None
+
+    try:
+        if is_path:
+            # In the case that the pdf_file is the file path, read its byte data.
+            pdf_bytes = FileBasedDataReader().read(pdf_file)
+            pdf_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
+        else:
+            pdf_bytes = pdf_file
+
+        # Create the output directory. The suffix is date + time including
+        # seconds, plus the first two digits of the microseconds.
+        timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-4]
+        local_md_dir = os.path.join(save_path or os.getcwd(), f"{pdf_file_name}_{timestamp}")
+        local_image_dir = os.path.join(local_md_dir, "images")
+        os.makedirs(local_image_dir, exist_ok=True)
+
+        md_writer = FileBasedDataWriter(local_md_dir)
+        image_writer = FileBasedDataWriter(local_image_dir)
+
+        ds = PymuDocDataset(pdf_bytes)
+
+        if parse_method == "auto":
+            parse_method = "ocr" if ds.classify() == SupportedPdfParseMethod.OCR else "txt"
+
+        # parse_method was validated above, so it is "txt" or "ocr" here.
+        if parse_method == "txt":
+            infer_result = ds.apply(doc_analyze, ocr=False)
+            pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=True)
+            pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True)
+
+        # result
+        md_content = pipe_result.dump_md(md_writer, f"{pdf_file_name}.md", local_image_dir)
+        list_content = pipe_result.dump_content_list(md_writer, f"{pdf_file_name}_content_list.json", local_image_dir)
+
+        # get text
+        txt_content = "\n".join(i.get("text", "") for i in list_content)
+
+        return md_content, list_content, txt_content
+
+    except Exception as e:
+        # Chain the cause so the original traceback is not lost.
+        raise Exception(f"An error occurred when processing the file: {e}") from e
+
+    finally:
+        if not is_save_output and local_md_dir and os.path.exists(local_md_dir):
+            # delete the output directory
+            rmtree(local_md_dir)
+
+
+# Manual smoke test: parse one local PDF from its raw bytes and keep the output.
+if __name__ == "__main__":
+    import asyncio
+
+    sample_pdf = "/home/yzz/pdf_file_test/Quality.pdf"
+    with open(sample_pdf, "rb") as stream:
+        raw_pdf = stream.read()
+
+    # pdf_parse_main also accepts the path itself in place of the bytes.
+    parse_job = pdf_parse_main(raw_pdf, parse_method="auto", is_save_output=True)
+    asyncio.run(parse_job)