diff --git a/Dockerfile b/Dockerfile index 0072d9e..82d0706 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ && rm -rf /var/lib/apt/lists/* -RUN pip install markitdown +RUN pip install markitdown fastapi uvicorn # Default USERID and GROUPID ARG USERID=10000 @@ -20,4 +20,4 @@ ARG GROUPID=10000 USER $USERID:$GROUPID -ENTRYPOINT [ "markitdown" ] +ENTRYPOINT ["uvicorn", "src.markitdown.api:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index d2314c3..1bb6345 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,42 @@ print(result.text_content) docker build -t markitdown:latest . docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md ``` + +### Web API + +You can also use MarkItDown via a REST endpoint. The Web API is built using FastAPI and can be run using Docker. + +#### Running the Web API + +1. Build the Docker image: + +```sh +docker build -t markitdown-api:latest . +``` + +2. Run the Docker container: + +```sh +docker run --rm -p 8000:8000 markitdown-api:latest +``` + +The Web API will be available at `http://localhost:8000`. + +#### Using the Web API + +The Web API provides a single endpoint `/convert` that accepts a file and returns the converted markdown. + +- **Endpoint:** `/convert` +- **Method:** `POST` +- **Request Body:** Multipart form data with a file field named `file` +- **Response:** JSON object with a `markdown` field containing the converted markdown + +Example using `curl`: + +```sh +curl -X POST "http://localhost:8000/convert" -F "file=@path-to-file.pdf" +``` +
Batch Processing Multiple Files
diff --git a/pyproject.toml b/pyproject.toml
index 3e14cec..3cb22bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,8 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "fastapi",
+  "uvicorn",
 ]
 
 [project.urls]
diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index b6cf963..aa820f8 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -6,6 +6,7 @@ from textwrap import dedent
 
 from .__about__ import __version__
 from ._markitdown import MarkItDown, DocumentConverterResult
+import uvicorn
 
 
 def main():
@@ -57,9 +58,16 @@ def main():
         "--output",
         help="Output file name. If not provided, output is written to stdout.",
     )
+    parser.add_argument(
+        "--api",
+        action="store_true",
+        help="Start the API server",
+    )
     args = parser.parse_args()
 
-    if args.filename is None:
+    if args.api:
+        uvicorn.run("markitdown.api:app", host="0.0.0.0", port=8000)
+    elif args.filename is None:
         markitdown = MarkItDown()
         result = markitdown.convert_stream(sys.stdin.buffer)
         _handle_output(args, result)
diff --git a/src/markitdown/api.py b/src/markitdown/api.py
new file mode 100644
index 0000000..0f9ce4e
--- /dev/null
+++ b/src/markitdown/api.py
@@ -0,0 +1,46 @@
+"""HTTP API exposing MarkItDown document conversion."""
+import os
+import tempfile
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import FileResponse
+from markitdown import MarkItDown
+
+app = FastAPI()
+
+
+@app.post("/convert")
+async def convert(file: UploadFile = File(...)):
+    """Convert an uploaded file to Markdown.
+
+    The upload is written to a uniquely named temporary file, converted with
+    MarkItDown, and the temporary file is removed whether or not the
+    conversion succeeds.
+
+    Returns a JSON object ``{"markdown": <converted text>}``. Responds with
+    HTTP 400 when the upload has no filename and HTTP 500 when reading or
+    converting the file fails.
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file uploaded")
+
+    # The client controls file.filename, so never use it as a path (a name
+    # like "../../x" would escape the temp directory). Keep only its
+    # extension so the temporary path ends the same way the upload did.
+    suffix = os.path.splitext(os.path.basename(file.filename))[1]
+    temp_file_path = None
+    try:
+        contents = await file.read()
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            temp_file.write(contents)
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(temp_file_path)
+        return {"markdown": result.text_content}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # Runs on success and failure alike, so uploads never pile up on disk.
+        if temp_file_path is not None and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)