Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: modular parser and formatter v0 #175

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ venv
*.DS_Store
.tool-versions
megaparse/sdk/examples/only_pdfs/*
benchmark/auto/*
benchmark/hi_res/*

Empty file.
26 changes: 0 additions & 26 deletions libs/megaparse/src/megaparse/checker/format_checker.py

This file was deleted.

211 changes: 0 additions & 211 deletions libs/megaparse/src/megaparse/checker/markdown_processor.py

This file was deleted.

29 changes: 29 additions & 0 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter
from megaparse.megaparse import MegaParse
from megaparse.formatter.structured_formatter.custom_structured_formatter import (
CustomStructuredFormatter,
)
from megaparse.parser.unstructured_parser import UnstructuredParser

from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


class MyCustomFormat(BaseModel):
    # Example output schema for the structured formatter used in this script.
    # NOTE(review): documented with comments on purpose -- pydantic is expected
    # to surface a class docstring as the schema description, which would
    # change what is sent to the model. Confirm before adding a docstring.

    # Title of the document.
    title: str = Field(
        description="The title of the document.",
    )
    # Problem statement found in the document.
    problem: str = Field(
        description="The problem statement.",
    )
    # Solution statement found in the document.
    solution: str = Field(
        description="The solution statement.",
    )


if __name__ == "__main__":
    # Example: run one PDF through a parser followed by two formatters.
    # Objects are built in the same order as before so that any constructor
    # failure surfaces at the same point.
    pdf_parser = UnstructuredParser()
    chat_model = ChatOpenAI()
    markdown_formatter = MarkDownFormatter()
    structured_formatter = CustomStructuredFormatter(
        model=chat_model,
        output_model=MyCustomFormat,
    )

    # Formatters are passed in this order: markdown first, then structured.
    pipeline = MegaParse(
        parser=pdf_parser,
        formatters=[markdown_formatter, structured_formatter],
    )

    sample_pdf = "libs/megaparse/tests/pdf/sample_pdf.pdf"
    print(pipeline.load(file_path=sample_pdf))
40 changes: 40 additions & 0 deletions libs/megaparse/src/megaparse/formatter/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from abc import ABC
from typing import List, Union

from langchain_core.language_models.chat_models import BaseChatModel
from unstructured.documents.elements import Element


# TODO: Implement the Formatter class @Chloe
class BaseFormatter(ABC):
    """
    Common base class for formatters.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model stored on the instance for subclasses to use.

    Methods
    -------
    format(elements, file_path=None)
        Dispatches to ``format_elements`` when given a list, otherwise to
        ``format_string``.
    format_elements(elements, file_path=None)
        Hook for list input; raises ``NotImplementedError`` in this class.
    format_string(text, file_path=None)
        Hook for non-list input; raises ``NotImplementedError`` in this class.
    """

    def __init__(self, model: BaseChatModel | None = None):
        # The model is only stored here; this class never calls it.
        self.model = model

    async def format(
        self, elements: Union[List[Element], str], file_path: str | None = None
    ) -> Union[List[Element], str]:
        """Route the input to the hook matching its runtime type."""
        # Anything that is not a list is treated as text.
        if not isinstance(elements, list):
            return await self.format_string(elements, file_path)
        return await self.format_elements(elements, file_path)

    async def format_elements(
        self, elements: List[Element], file_path: str | None = None
    ) -> Union[List[Element], str]:
        """Format a list of elements; must be overridden to be usable."""
        raise NotImplementedError("Subclasses should implement this method")

    async def format_string(
        self, text: str, file_path: str | None = None
    ) -> Union[List[Element], str]:
        """Format a text input; must be overridden to be usable."""
        raise NotImplementedError("Subclasses should implement this method")
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from pydantic import BaseModel


class StructuredFormatter(BaseFormatter):
    """Base class for formatters that are configured with an output model.

    Stores the pydantic model class given as ``output_model`` alongside the
    chat model; ``format_string`` is left for subclasses to implement.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        # Unlike the base class, a chat model is a required argument here.
        super().__init__(model=model)
        self.output_model = output_model

    async def format_string(
        self,
        text: str,
        file_path: str | None = None,
    ) -> str:
        """Format ``text``; not implemented in this class."""
        # FIXME: Return a structured output of type BaseModel ?
        raise NotImplementedError
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Optional

from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.structured_formatter import StructuredFormatter
from pydantic import BaseModel


class CustomStructuredFormatter(StructuredFormatter):
    """Formatter that asks the chat model for output matching ``output_model``."""

    async def format_string(
        self,
        text: str,
        file_path: str | None = None,
    ) -> str:
        """
        Structure the text using an AI language model.

        Args:
            text: The text to format. Must be non-empty.
            file_path: The file path of the text. Not used by this method.

        Returns:
            The structured output serialised as a JSON string.

        Raises:
            ValueError: If no model is set, ``text`` is empty, or no output
                model is set.
            AssertionError: If the model's output is not a ``BaseModel``.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
        print("Formatting text using CustomStructuredFormatter...")
        # The previous guard was `len(text) < 0`, which can never be true, so
        # empty text was silently sent to the model. Reject it explicitly.
        if not text:
            raise ValueError(
                "A non empty text is needed to format text using CustomStructuredFormatter."
            )
        if not self.output_model:
            raise ValueError(
                "An output model is needed to structure text using CustomStructuredFormatter."
            )

        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore

        # NOTE(review): `invoke` is a synchronous call inside an async method;
        # consider an async variant if blocking the event loop is a concern.
        formatted_text = structured_model.invoke(
            f"Parse the text in a structured format: {text}"
        )
        # Internal sanity check that also narrows the type for the call below.
        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."

        return formatted_text.model_dump_json()
12 changes: 12 additions & 0 deletions libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import List

from unstructured.documents.elements import Element

from megaparse.formatter.base import BaseFormatter


class TableFormatter(BaseFormatter):
    """Base class for formatters that operate on a list of elements.

    ``format_elements`` is narrowed to return a list of elements and is left
    for subclasses to implement.
    """

    async def format_elements(
        self,
        elements: List[Element],
        file_path: str | None = None,
    ) -> List[Element]:
        """Format ``elements``; not implemented in this class."""
        raise NotImplementedError
Loading
Loading