Skip to content

Commit

Permalink
add: structured output formatter
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia committed Dec 9, 2024
1 parent 5b63dc6 commit eea6cfd
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 13 deletions.
19 changes: 17 additions & 2 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter
from megaparse.megaparse import MegaParse
from megaparse.formatter.structured_formatter.custom_structured_formatter import (
CustomStructuredFormatter,
)
from megaparse.parser.unstructured_parser import UnstructuredParser

from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


class MyCustomFormat(BaseModel):
    # Example output schema for this script: three required string fields.
    # Notes are kept as `#` comments on purpose -- pydantic surfaces a model
    # class docstring in the generated JSON schema, which would alter the
    # schema derived from this class.
    title: str = Field(
        ...,
        description="The title of the document.",
    )
    problem: str = Field(
        ...,
        description="The problem statement.",
    )
    solution: str = Field(
        ...,
        description="The solution statement.",
    )


if __name__ == "__main__":
    # Example: parse a PDF, then run it through a Markdown formatter and a
    # structured-output formatter backed by a chat model.
    parser = UnstructuredParser()
    model = ChatOpenAI()
    formatter_1 = MarkDownFormatter()
    formatter_2 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

    # NOTE(review): the formatters are presumably applied in list order, so
    # the structured formatter would receive the Markdown output -- confirm
    # against MegaParse.
    megaparse = MegaParse(parser=parser, formatters=[formatter_1, formatter_2])

    file_path = "libs/megaparse/tests/pdf/sample_pdf.pdf"
    result = megaparse.load(file_path=file_path)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from pydantic import BaseModel


class StructuredFormatter(BaseFormatter):
    """Base formatter that carries a pydantic output schema.

    Keeps the ``output_model`` class on the instance; ``format_string`` is a
    placeholder here and raises ``NotImplementedError``.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        """Pass ``model`` to ``BaseFormatter`` and store ``output_model``."""
        super().__init__(model)
        self.output_model = output_model

    # FIXME: Return a structured output of type BaseModel ?
    async def format_string(self, text: str, file_path: str | None = None) -> str:
        """Format ``text``; not implemented on this base class.

        Args:
            text: The text to format.
            file_path: Optional path of the file the text came from.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Optional

from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.structured_formatter import StructuredFormatter
from pydantic import BaseModel


class CustomStructuredFormatter(StructuredFormatter):
    """Formatter that asks the chat model to fill ``self.output_model`` from text."""

    async def format_string(
        self,
        text: str,
        file_path: str | None = None,
    ) -> str:
        """
        Structure the text using an AI language model.

        Args:
            text: The text to format. Must not be empty.
            file_path: The file path of the text. Not used by this formatter.

        Returns:
            The model's structured output, serialized as a JSON string.

        Raises:
            ValueError: If no model is set, if ``text`` is empty, or if no
                output model is set.
            AssertionError: If the model's output is not a pydantic
                ``BaseModel`` instance.
        """
        if not self.model:
            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
        print("Formatting text using CustomStructuredFormatter...")
        # len() is never negative, so emptiness has to be tested directly for
        # this guard to ever fire.
        if not text:
            raise ValueError(
                "A non empty text is needed to format text using CustomStructuredFormatter."
            )
        if not self.output_model:
            raise ValueError(
                "An output model is needed to structure text using CustomStructuredFormatter."
            )

        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore

        # This is a coroutine: await the async entry point instead of calling
        # the blocking `invoke`, which would stall the event loop for the
        # whole model round-trip.
        formatted_text = await structured_model.ainvoke(
            f"Parse the text in a structured format: {text}"
        )
        # Explicit raise (same exception type and message as an assert would
        # give) so the check is not stripped when Python runs with -O.
        if not isinstance(formatted_text, BaseModel):
            raise AssertionError("Model output is not a BaseModel.")

        return formatted_text.model_dump_json()

This file was deleted.

0 comments on commit eea6cfd

Please sign in to comment.