diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py
index b728824..b10d811 100644
--- a/libs/megaparse/src/megaparse/examples/parse_file.py
+++ b/libs/megaparse/src/megaparse/examples/parse_file.py
@@ -1,13 +1,28 @@
 from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter
 from megaparse.megaparse import MegaParse
+from megaparse.formatter.structured_formatter.custom_structured_formatter import (
+    CustomStructuredFormatter,
+)
 from megaparse.parser.unstructured_parser import UnstructuredParser
 
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+
+
+class MyCustomFormat(BaseModel):
+    title: str = Field(description="The title of the document.")
+    problem: str = Field(description="The problem statement.")
+    solution: str = Field(description="The solution statement.")
+
+
 if __name__ == "__main__":
     # Parse a file
     parser = UnstructuredParser()
-    formatter = MarkDownFormatter()
+    model = ChatOpenAI()
+    formatter_1 = MarkDownFormatter()
+    formatter_2 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)
 
-    megaparse = MegaParse(parser=parser, formatters=[formatter])
+    megaparse = MegaParse(parser=parser, formatters=[formatter_1, formatter_2])
 
     file_path = "libs/megaparse/tests/pdf/sample_pdf.pdf"
     result = megaparse.load(file_path=file_path)
diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
new file mode 100644
index 0000000..c369a15
--- /dev/null
+++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
@@ -0,0 +1,16 @@
+from langchain_core.language_models.chat_models import BaseChatModel
+from megaparse.formatter.base import BaseFormatter
+from pydantic import BaseModel
+
+
+class StructuredFormatter(BaseFormatter):
+    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
+        super().__init__(model)
+        self.output_model = output_model
+
+    async def format_string(
+        self,
+        text: str,
+        file_path: str | None = None,
+    ) -> str:  # FIXME: Return a structured output of type BaseModel ?
+        raise NotImplementedError()
diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py
new file mode 100644
index 0000000..c5a5a50
--- /dev/null
+++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py
@@ -0,0 +1,45 @@
+from typing import Optional
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from megaparse.formatter.structured_formatter import StructuredFormatter
+from pydantic import BaseModel
+
+
+class CustomStructuredFormatter(StructuredFormatter):
+    async def format_string(
+        self,
+        text: str,
+        file_path: str | None = None,
+    ) -> str:
+        """
+        Structure the text using an AI language model.
+        Args:
+            text: The text to format.
+            file_path: The file path of the text (not used by this formatter).
+        Returns:
+            The structured output serialized as a JSON string.
+        Raises:
+            ValueError: If the model, the text or the output model is missing.
+        """
+        if not self.model:
+            raise ValueError("A Model is needed to use the CustomStructuredFormatter.")
+        print("Formatting text using CustomStructuredFormatter...")
+        # len() is never negative, so test truthiness to actually reject empty text.
+        if not text:
+            raise ValueError(
+                "A non empty text is needed to format text using CustomStructuredFormatter."
+            )
+        if not self.output_model:
+            raise ValueError(
+                "An output model is needed to structure text using CustomStructuredFormatter."
+            )
+
+        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore
+
+        # Await the async entry point so the LLM call does not block the event loop.
+        formatted_text = await structured_model.ainvoke(
+            f"Parse the text in a structured format: {text}"
+        )
+        assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel."
+
+        return formatted_text.model_dump_json()
diff --git a/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py
deleted file mode 100644
index 9152b58..0000000
--- a/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# from typing import List
-
-# from megaparse.formatter.base import BaseFormatter
-# from pydantic import BaseModel
-
-
-# class StructuredFormatter(BaseFormatter):
-#     async def format_string(
-#         self, text: str, file_path: str | None = None, model: BaseModel | None = None
-#     ) -> BaseModel:
-#         raise NotImplementedError()