Skip to content

Commit

Permalink
feat(web_api): pdf parse API and function implementation
Browse files Browse the repository at this point in the history
- added md_content and content_list result to return in the functions
  • Loading branch information
yzztin committed Jan 3, 2025
1 parent 81fcef8 commit 86ba8f5
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 0 deletions.
2 changes: 2 additions & 0 deletions magic_pdf/pipe/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def dump_md(
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
)
writer.write_string(file_path, md_content)
return md_content

def dump_content_list(
self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
Expand All @@ -66,6 +67,7 @@ def dump_content_list(
writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
)
return content_list

def dump_middle_json(self, writer: DataWriter, file_path: str):
"""Dump the result of pipeline.
Expand Down
48 changes: 48 additions & 0 deletions projects/web_api/app_v0_10_6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Literal

from fastapi import FastAPI, UploadFile, HTTPException

from pdf_parse_main import pdf_parse_main

app = FastAPI()

parse_allowed_methods = Literal["auto", "txt", "ocr"]

# Here to set the default output path of the parsing result file
PDF_OUTPUT_PATH = "/tmp/output"


@app.post("/pdf-parse")
async def pdf_parse(
file: UploadFile,
parse_method: parse_allowed_methods = "auto",
is_output: bool = False,
save_path: str = PDF_OUTPUT_PATH,
):
"""
is_output: Whether to keep the parsing result file
save_path: Parse result file save path
"""
if not file.filename.lower().endswith(".pdf"):
raise HTTPException(status_code=415, detail="File type error")

pdf_bytes = await file.read()
pdf_file_name = file.filename.split(".")[0]

try:
md_content, list_content, txt_content = await pdf_parse_main(
pdf_bytes, pdf_file_name, parse_method, is_output, save_path
)

return {"md_data": md_content, "content_list_data": list_content, "txt_data": txt_content}

except ValueError as ve:
raise HTTPException(status_code=400, detail=str(ve))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
import uvicorn

uvicorn.run(app, host="0.0.0.0", port=8999)
106 changes: 106 additions & 0 deletions projects/web_api/pdf_parse_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
from shutil import rmtree
from datetime import datetime

from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset


async def pdf_parse_main(
pdf_file,
pdf_file_name: str = "noname",
parse_method: str = "auto",
is_save_output: bool = False,
save_path: str = None,
):
"""
Args:
pdf_file: file path or file bytes
pdf_file_name: Use the file name as the folder name for saving the result file.
parse_method: auto/txt/ocr
is_save_output: Whether to save the output file.
save_path: Directory to save the output file. By default, the output file will be saved in the current workspace directory.
Returns:
md_content: markdown result without images
list_content: list result
txt_content: just text from list_content
"""
local_md_dir = None

try:
# In the case that the pdf_file is the file path, read its byte data.
if isinstance(pdf_file, str) and os.path.exists(pdf_file):
file_reader = FileBasedDataReader()
pdf_bytes = file_reader.read(pdf_file)
pdf_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
elif isinstance(pdf_file, bytes):
pdf_bytes = pdf_file
pdf_file_name = pdf_file_name
else:
raise ValueError(
"pdf_file must be a file path or byte data. \
Please ensure the path is correct or provide the correct byte data."
)

# Create the output directory
timestamp = datetime.now().strftime("%Y%m%d%H%M%f")[:-4]
if save_path:
local_md_dir = os.path.join(save_path, f"{pdf_file_name}_{timestamp}")
else:
local_md_dir = os.path.join(os.getcwd(), f"{pdf_file_name}_{timestamp}")
local_image_dir = os.path.join(local_md_dir, "images")
os.makedirs(local_image_dir, exist_ok=True)

md_writer = FileBasedDataWriter(local_md_dir)
image_writer = FileBasedDataWriter(local_image_dir)

ds = PymuDocDataset(pdf_bytes)

if parse_method == "auto":
parse_method = "ocr" if ds.classify() == SupportedPdfParseMethod.OCR else "txt"

if parse_method == "txt":
infer_result = ds.apply(doc_analyze, ocr=False)
pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True)
elif parse_method == "ocr":
infer_result = ds.apply(doc_analyze, ocr=True)
pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True)
else:
raise ValueError(f"Unsupported parsing method: {parse_method}, please choose [auto, txt, ocr].")

# result
md_content = pipe_result.dump_md(md_writer, f"{pdf_file_name}.md", local_image_dir)
list_content = pipe_result.dump_content_list(md_writer, f"{pdf_file_name}_content_list.json", local_image_dir)
# middle_content = pipe_result._pipe_res

# get text
txt_content = "\n".join(i.get("text", "") for i in list_content)

return md_content, list_content, txt_content

except Exception as e:
raise Exception(f"An error occurred when processing the file: {e}")

finally:
if not is_save_output and local_md_dir and os.path.exists(local_md_dir):
# delete the output directory
rmtree(local_md_dir)


# test
if __name__ == "__main__":
import asyncio

pdf_path = "/home/yzz/pdf_file_test/Quality.pdf"

with open(pdf_path, "rb") as f:
pdf_bytes = f.read()

# asyncio.run(pdf_parse_main(pdf_path, parse_method="auto", is_save_output=True))
asyncio.run(pdf_parse_main(pdf_bytes, parse_method="auto", is_save_output=True))

0 comments on commit 86ba8f5

Please sign in to comment.