Skip to content

Commit

Permalink
feat: megaparse-sdk-cherry (#105)
Browse files Browse the repository at this point in the history
* feat:create base sdk

* fix: restructure repo

* fix: separate package

* fix : remove comments

* add: pyproject

* fix: rebase branch

* fix: release please & sdk dep

* delete general.py

* fix: remove unused files

* change bearer to api key & reformat imports

* fix: API key optional
  • Loading branch information
chloedia authored Nov 7, 2024
1 parent 8261b00 commit ad44aa3
Show file tree
Hide file tree
Showing 22 changed files with 625 additions and 345 deletions.
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
LLAMA_CLOUD_API_KEY=llx-1234567890
OPENAI_API_KEY = sk-1234567890
OPENAI_API_KEY=sk-1234567890
MEGAPARSE_API_KEY=MyMegaParseKey
17 changes: 17 additions & 0 deletions .github/workflows/release-please.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,20 @@ jobs:
run: rye build
- name: Rye Publish
run: rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes

deploy-sdk:
if: needs.release-please.outputs.release_created == 'true'
needs: release-please
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Rye
uses: eifinger/setup-rye@v2
with:
enable-cache: true
- name: Rye Sync
run: cd megaparse/sdk && rye sync --no-lock
- name: Rye Build
run: cd megaparse/sdk && rye build
- name: Rye Publish
run: cd megaparse/sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
*.md
/output
/input
.env
Expand Down
9 changes: 7 additions & 2 deletions megaparse/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# from .Converter import MegaParse
"""My library with optional components."""

# __all__ = ["MegaParse"]
__version__ = "0.1.0"

# Import only SDK components by default
from megaparse import sdk

__all__ = ["sdk"]
19 changes: 10 additions & 9 deletions megaparse/api/app.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import os
import tempfile
from fastapi import Depends, FastAPI, UploadFile, File, HTTPException

import httpx
import psutil
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_openai import ChatOpenAI
from llama_parse.utils import Language

from megaparse.api.utils.type import HTTPModelNotSupported
from megaparse.core.megaparse import MegaParse
from megaparse.core.parser.builder import ParserBuilder
from megaparse.core.parser.type import ParserConfig, ParserType
from megaparse.core.parser.unstructured_parser import StrategyEnum, UnstructuredParser
import psutil
import os
from langchain_community.document_loaders import PlaywrightURLLoader

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from llama_parse.utils import Language
import httpx

app = FastAPI()

Expand Down
4 changes: 4 additions & 0 deletions megaparse/sdk/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## MegaParse SDK

What if you don't care about the code and just want to call a service that does everything for you?
We thought of that (since we do too): just use our SDK.
3 changes: 3 additions & 0 deletions megaparse/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from megaparse.sdk.src import MegaParseSDK

__all__ = ["MegaParseSDK"]
31 changes: 31 additions & 0 deletions megaparse/sdk/examples/usage_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import asyncio
import os

from megaparse.sdk import MegaParseSDK


async def main():
    """Run the SDK demo: parse one URL and one local PDF, printing both results.

    The API key is read from the ``MEGAPARSE_API_KEY`` environment variable.
    """
    # Fall back to "" instead of wrapping the lookup in str(): str(None) is the
    # truthy string "None", which would be handed to the SDK as if it were a key.
    api_key = os.getenv("MEGAPARSE_API_KEY", "")
    megaparse = MegaParseSDK(api_key)

    try:
        url = "https://www.quivr.com"

        # Upload a URL
        url_response = await megaparse.url.upload(url)
        print(f"\n----- URL Response : {url} -----\n")
        print(url_response)

        file_path = "megaparse/sdk/pdf/MegaFake_report.pdf"
        # Upload a file
        response = await megaparse.file.upload(
            file_path=file_path,
            method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
            strategy="auto",
        )
        print(f"\n----- File Response : {file_path} -----\n")
        print(response)
    finally:
        # Release the client even when one of the uploads raises.
        await megaparse.close()


if __name__ == "__main__":
    asyncio.run(main())
Binary file added megaparse/sdk/pdf/MegaFake_report.pdf
Binary file not shown.
30 changes: 30 additions & 0 deletions megaparse/sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[project]
name = "megaparse-sdk"
version = "0.1.0"
description = "Megaparse SDK"
dependencies = [
"python-dotenv>=1.0.0",
"pycryptodome>=3.21.0",
"psutil>=6.1.0",
"llama-parse>=0.4.0",
"httpx>=0.27.0",

]

readme = "README.md"
requires-python = "< 3.12"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = []
universal = true

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = ["megaparse_sdk"]
13 changes: 13 additions & 0 deletions megaparse/sdk/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from .client import MegaParseClient
from .endpoints.file_upload import FileUpload
from .endpoints.url_upload import URLUpload


class MegaParseSDK:
def __init__(self, api_key: str):
self.client = MegaParseClient(api_key)
self.file = FileUpload(self.client)
self.url = URLUpload(self.client)

async def close(self):
await self.client.close()
26 changes: 26 additions & 0 deletions megaparse/sdk/src/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from typing import Any

import httpx


class MegaParseClient:
def __init__(self, api_key: str | None = None):
self.base_url = "https://megaparse.tooling.quivr.app" # to define once in production # to define once in production

self.api_key = api_key
if self.api_key:
self.session = httpx.AsyncClient(
headers={"x-api-key": self.api_key}, timeout=60
)
else:
self.session = httpx.AsyncClient(timeout=60)

async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any:
url = f"{self.base_url}{endpoint}"
client = self.session
response = await client.request(method, url, **kwargs)
response.raise_for_status()
return response.json()

async def close(self):
await self.session.aclose()
1 change: 1 addition & 0 deletions megaparse/sdk/src/endpoints/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

34 changes: 34 additions & 0 deletions megaparse/sdk/src/endpoints/file_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from typing import Optional

from httpx import Response
from llama_parse.utils import Language

from megaparse.sdk.src.client import MegaParseClient
from megaparse.sdk.utils.type import ParserType, StrategyEnum


class FileUpload:
    """Endpoint helper that sends a local file to ``POST /v1/file``."""

    def __init__(self, client: MegaParseClient):
        """Keep a reference to the shared client used to send requests."""
        self.client = client

    async def upload(
        self,
        file_path: str,
        method: ParserType = ParserType.UNSTRUCTURED,
        strategy: str = StrategyEnum.AUTO,
        check_table: bool = False,
        language: Language = Language.ENGLISH,
        parsing_instruction: Optional[str] = None,
        model_name: str = "gpt-4o",
    ) -> Response:
        """Upload the file at ``file_path`` together with the parsing options.

        Args:
            file_path: path of the file to open in binary mode and upload.
            method: parser to use; an enum member or its plain string value.
            strategy: strategy to use; an enum member or its plain string value.
            check_table: forwarded as the ``check_table`` form field.
            language: document language; an enum member or its plain value.
            parsing_instruction: forwarded as the ``parsing_instruction`` field.
            model_name: forwarded as the ``model_name`` form field.

        Returns:
            Whatever ``self.client.request`` returns.
            NOTE(review): the signature says ``Response``; confirm against the
            client, which may already decode the body.

        Raises:
            OSError: if ``file_path`` cannot be opened.
        """
        # Send plain values for every enum-like option. str() of a str-mixin
        # Enum member is "ParserType.UNSTRUCTURED", not "unstructured", so a
        # raw member must not reach a serializer that stringifies form fields.
        # getattr() also lets callers pass ordinary strings.
        data = {
            "method": getattr(method, "value", method),
            "strategy": getattr(strategy, "value", strategy),
            "check_table": check_table,
            "language": getattr(language, "value", language),
            "parsing_instruction": parsing_instruction,
            "model_name": model_name,
        }
        # The request is awaited inside the ``with`` so the file handle is
        # still open while its content is being read for the upload.
        with open(file_path, "rb") as file:
            files = {"file": (file_path, file)}
            return await self.client.request(
                "POST", "/v1/file", files=files, data=data
            )
11 changes: 11 additions & 0 deletions megaparse/sdk/src/endpoints/url_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from megaparse.sdk.src.client import MegaParseClient


class URLUpload:
    """Endpoint helper that asks the service to parse a URL (``POST /v1/url``)."""

    def __init__(self, client: "MegaParseClient"):
        """Keep a reference to the shared client used to send requests."""
        self.client = client

    async def upload(self, url: str):
        """Submit ``url`` for parsing and return the client's response.

        Args:
            url: the address to parse; it travels in the ``url`` query parameter.

        Returns:
            Whatever ``self.client.request`` returns.
        """
        from urllib.parse import urlencode

        # Percent-encode the target: interpolated raw, any "&", "#" or "?"
        # inside it would be read as part of *this* request's query string.
        query = urlencode({"url": url})
        endpoint = f"/v1/url?{query}"
        headers = {"accept": "application/json"}
        # NOTE(review): the empty-string ``data`` is kept to preserve the
        # existing request shape — confirm whether it can simply be dropped.
        return await self.client.request("POST", endpoint, headers=headers, data="")
17 changes: 17 additions & 0 deletions megaparse/sdk/utils/type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from enum import Enum


class ParserType(str, Enum):
    """Parser type enumeration.

    Members are ``str`` instances and compare equal to their value.
    ``str(member)`` also returns the plain value (``"unstructured"``) instead
    of the default ``"ParserType.UNSTRUCTURED"``, so a member can be handed to
    code that stringifies it (form fields, f-strings, logs) without leaking
    the class name.
    """

    UNSTRUCTURED = "unstructured"
    LLAMA_PARSER = "llama_parser"
    MEGAPARSE_VISION = "megaparse_vision"

    def __str__(self) -> str:
        """Return the member's plain string value."""
        return str(self.value)


class StrategyEnum(str, Enum):
    """Method to use for the conversion.

    Members are ``str`` instances and compare equal to their value.
    ``str(member)`` also returns the plain value (``"auto"``) instead of the
    default ``"StrategyEnum.AUTO"``, so a member can be handed to code that
    stringifies it (form fields, f-strings, logs) without leaking the class
    name.
    """

    FAST = "fast"
    AUTO = "auto"
    HI_RES = "hi_res"

    def __str__(self) -> str:
        """Return the member's plain string value."""
        return str(self.value)
77 changes: 59 additions & 18 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,52 @@ authors = [
]

readme = "README.md"
requires-python = ">= 3.11"

dependencies = [
"python-docx>=1.1.0",
"mammoth>=1.8.0",
"python-pptx>=1.0.2",
"llama-parse>=0.4.0",
"pdf2docx>=0.5.0",
"unstructured[all-docs]>=0.15.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
"langchain-core>=0.2.0",
"python-dotenv>=1.0.0",
]

[project.optional-dependencies]
all = [
"pycryptodome>=3.21.0",
"llama-index>=0.10.0",
"pdfplumber>=0.11.0",
"fastapi>=0.115.2",
"uvicorn>=0.32.0",
"ratelimit>=2.2.1",
"requests>=2.32.3",
"backoff>=2.2.1",
"pypdf>=5.0.1",
"psutil>=6.1.0",
"numpy<=2.0.0",
"playwright>=1.47.0",
"langchain-anthropic>=0.2.3",
"python-magic>=0.4.27",
"unstructured[all-docs]>=0.15.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
"langchain-core>=0.2.0",
"llama-parse>=0.4.0",
"uvicorn>=0.32.0",
"fastapi>=0.115.2",
"ratelimit>=2.2.1",
"requests>=2.32.3",
]
requires-python = ">= 3.11"

[tool.rye.workspace]
members = [
".",
"megaparse/sdk"
]


[tool.hatch.metadata]
allow-direct-references = true

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.rye]
python = "3.11.9"
python = ">= 3.11"
managed = true
universal = true
dev-dependencies = [
Expand All @@ -59,8 +69,39 @@ dev-dependencies = [
"pytest-cov>=5.0.0",
]

[tool.hatch.metadata]
allow-direct-references = true
[tool.ruff]
line-length = 88
exclude = [".git", "__pycache__", ".mypy_cache", ".pytest_cache"]

[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"C", # flake8-comprehensions
"B", # flake8-bugbear
]
ignore = [
"B904",
"B006",
"E501", # line too long, handled by black
"B008", # do not perform function calls in argument defaults
"C901", # too complex
]

[tool.ruff.lint.isort]
order-by-type = true
relative-imports-order = "closest-to-furthest"
extra-standard-library = ["typing"]
section-order = [
"future",
"standard-library",
"third-party",
"first-party",
"local-folder",
]
known-first-party = []

[tool.hatch.build.targets.wheel]
packages = ["megaparse"]
Loading

0 comments on commit ad44aa3

Please sign in to comment.