From 206b4a5ad65d2fe6913bc7ea9f747ebfc8f25852 Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:02:52 +0530 Subject: [PATCH 1/6] Add Web API for MarkItDown Related to #133 --- Dockerfile | 4 ++-- README.md | 36 ++++++++++++++++++++++++++++++++++++ src/markitdown/api.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 src/markitdown/api.py diff --git a/Dockerfile b/Dockerfile index 0072d9e..82d0706 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ && rm -rf /var/lib/apt/lists/* -RUN pip install markitdown +RUN pip install markitdown fastapi uvicorn # Default USERID and GROUPID ARG USERID=10000 @@ -20,4 +20,4 @@ ARG GROUPID=10000 USER $USERID:$GROUPID -ENTRYPOINT [ "markitdown" ] +ENTRYPOINT ["uvicorn", "src.markitdown.api:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index d2314c3..1bb6345 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,42 @@ print(result.text_content) docker build -t markitdown:latest . docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md ``` + +### Web API + +You can also use MarkItDown via a REST endpoint. The Web API is built using FastAPI and can be run using Docker. + +#### Running the Web API + +1. Build the Docker image: + +```sh +docker build -t markitdown-api:latest . +``` + +2. Run the Docker container: + +```sh +docker run --rm -p 8000:8000 markitdown-api:latest +``` + +The Web API will be available at `http://localhost:8000`. + +#### Using the Web API + +The Web API provides a single endpoint `/convert` that accepts a file and returns the converted markdown. 
+
+- **Endpoint:** `/convert`
+- **Method:** `POST`
+- **Request Body:** Multipart form data with a file field named `file`
+- **Response:** JSON object with a `markdown` field containing the converted markdown
+
+Example using `curl`:
+
+```sh
+curl -X POST "http://localhost:8000/convert" -F "file=@path-to-file.pdf"
+```
+
Batch Processing Multiple Files diff --git a/src/markitdown/api.py b/src/markitdown/api.py new file mode 100644 index 0000000..b57a4cf --- /dev/null +++ b/src/markitdown/api.py @@ -0,0 +1,31 @@ +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.responses import FileResponse +from markitdown import MarkItDown +import os + +app = FastAPI() + +@app.post("/convert") +async def convert(file: UploadFile = File(...)): + if not file.filename: + raise HTTPException(status_code=400, detail="No file uploaded") + + try: + contents = await file.read() + temp_file_path = f"/tmp/{file.filename}" + with open(temp_file_path, "wb") as temp_file: + temp_file.write(contents) + + markitdown = MarkItDown() + result = markitdown.convert(temp_file_path) + + output_file_path = f"/tmp/{os.path.splitext(file.filename)[0]}.md" + with open(output_file_path, "w") as output_file: + output_file.write(result.text_content) + + os.remove(temp_file_path) + + return FileResponse(output_file_path, filename=f"{os.path.splitext(file.filename)[0]}.md") + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) From 05084712c491a8484a94ce9408334583e185708b Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:07:13 +0530 Subject: [PATCH 2/6] --- src/markitdown/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index b6cf963..ea32f60 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -6,6 +6,7 @@ from textwrap import dedent from .__about__ import __version__ from ._markitdown import MarkItDown, DocumentConverterResult +import uvicorn def main(): @@ -57,9 +58,16 @@ def main(): "--output", help="Output file name. 
If not provided, output is written to stdout.", ) + parser.add_argument( + "--api", + action="store_true", + help="Start the FastAPI server", + ) args = parser.parse_args() - if args.filename is None: + if args.api: + uvicorn.run("src.markitdown.api:app", host="0.0.0.0", port=8000) + elif args.filename is None: markitdown = MarkItDown() result = markitdown.convert_stream(sys.stdin.buffer) _handle_output(args, result) From 5cc9144942d1f6c0d3d6317faa28edf20c9e3abf Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:08:36 +0530 Subject: [PATCH 3/6] --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..3cb22bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "fastapi", + "uvicorn", ] [project.urls] From a85af725fe2dce6d35bc0b28575f47fa2e51848a Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:10:27 +0530 Subject: [PATCH 4/6] --- src/markitdown/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index ea32f60..aa820f8 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -60,8 +60,8 @@ def main(): ) parser.add_argument( "--api", - action="store_true", - help="Start the FastAPI server", + action="api", + help="Start the API server", ) args = parser.parse_args() From a1989520d46c3d367f87ff93ba6a5f26e70edf12 Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:13:13 +0530 Subject: [PATCH 5/6] --- src/markitdown/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/markitdown/api.py b/src/markitdown/api.py index b57a4cf..85f4350 100644 --- a/src/markitdown/api.py +++ b/src/markitdown/api.py @@ -25,7 +25,8 @@ async def convert(file: UploadFile = File(...)): os.remove(temp_file_path) - return FileResponse(output_file_path, 
filename=f"{os.path.splitext(file.filename)[0]}.md") + # return FileResponse(output_file_path, filename=f"{os.path.splitext(file.filename)[0]}.md") + return {"markdown": result.text_content} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) From d46cff88579ecaa5e3e7cbb04c820fb6e095ae63 Mon Sep 17 00:00:00 2001 From: Vijay Soni Date: Mon, 23 Dec 2024 01:13:56 +0530 Subject: [PATCH 6/6] --- src/markitdown/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/markitdown/api.py b/src/markitdown/api.py index 85f4350..0f9ce4e 100644 --- a/src/markitdown/api.py +++ b/src/markitdown/api.py @@ -19,9 +19,9 @@ async def convert(file: UploadFile = File(...)): markitdown = MarkItDown() result = markitdown.convert(temp_file_path) - output_file_path = f"/tmp/{os.path.splitext(file.filename)[0]}.md" - with open(output_file_path, "w") as output_file: - output_file.write(result.text_content) + # output_file_path = f"/tmp/{os.path.splitext(file.filename)[0]}.md" + # with open(output_file_path, "w") as output_file: + # output_file.write(result.text_content) os.remove(temp_file_path)