diff --git a/libs/megaparse/src/megaparse/checker/__init__.py b/libs/megaparse/src/megaparse/checker/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/libs/megaparse/src/megaparse/checker/format_checker.py b/libs/megaparse/src/megaparse/checker/format_checker.py deleted file mode 100644 index aa7ae3a..0000000 --- a/libs/megaparse/src/megaparse/checker/format_checker.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List - -from langchain_core.language_models.chat_models import BaseChatModel -from unstructured.documents.elements import Element - - -# TODO: Implement the FormatChecker class @Chloe -class FormatChecker: - """ - A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. - Attributes - ---------- - model : BaseChatModel - An instance of a chat model used to process and improve the layout of elements. - Methods - ------- - improve_layout(elements: List[Element]) -> List[Element] - Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. - - """ - - def __init__(self, model: BaseChatModel): - self.model = model - - def check(self, elements: List[Element]): - raise NotImplementedError("Method not implemented yet") diff --git a/libs/megaparse/src/megaparse/checker/markdown_processor.py b/libs/megaparse/src/megaparse/checker/markdown_processor.py deleted file mode 100644 index 541a282..0000000 --- a/libs/megaparse/src/megaparse/checker/markdown_processor.py +++ /dev/null @@ -1,211 +0,0 @@ -# Code to clean markdown files - not used but to be refactored -# import os -# from collections import Counter -# from typing import List, Tuple, Dict -# from langchain_openai import ChatOpenAI -# from dotenv import load_dotenv - - -# class MarkdownProcessor: -# """ -# Class for MarkdownProcessor. -# """ - -# load_dotenv() - -# def __init__(self, md_result: str, strict: bool, remove_pagination: bool): -# self.md_result = md_result -# self.strict = strict -# self.remove_pagination = remove_pagination - -# @staticmethod -# def clean(text: str) -> str: -# """ -# Clean the input text by removing newlines, double asterisks, and trimming whitespace. - -# Args: -# text (str): Input text - -# Returns: -# str: Cleaned text -# """ -# text = text.replace("\n", "") -# text = text.replace("**", "") -# text = text.strip() -# return text - -# def split_into_pages(self) -> List[str]: -# """ -# Split the markdown result into pages using triple newlines as the delimiter. - -# Returns: -# List[str]: Splitted markdown -# """ -# return self.md_result.split("\n\n\n") - -# @staticmethod -# def split_into_paragraphs(pages: list) -> List[str]: -# """ -# Split pages into paragraphs using double newlines as the delimiter. - -# Args: -# pages (list): Pages - -# Returns: -# List[str]: Splitted pages -# """ -# return "\n\n".join(pages).split("\n\n") - -# def remove_duplicates(self, paragraphs: list) -> Tuple[str, List[str]]: -# """ -# Remove duplicate paragraphs and identify unique and duplicate paragraphs. - -# Args: -# paragraphs (list): Paragraphs - -# Returns: -# Tuple[str, List[str]]: Cleaned paragraphs and duplicate paragraphs -# """ -# unique_paragraphs = list( -# set([self.clean(paragraph) for paragraph in paragraphs]) -# ) -# duplicate_paragraphs = [] -# cleaned_paragraphs = [] - -# for paragraph in paragraphs: -# cleaned_paragraph = self.clean(paragraph) -# if cleaned_paragraph in unique_paragraphs: -# cleaned_paragraphs.append(paragraph) -# unique_paragraphs.remove(cleaned_paragraph) -# else: -# duplicate_paragraphs.append(paragraph) -# return cleaned_paragraphs, duplicate_paragraphs - -# def identify_header_components(self, duplicate_paragraphs: list) -> Dict: -# """ -# Identify words in duplicate paragraphs that are likely header components. - -# Args: -# duplicate_paragraphs (list): Duplicate paragraphs - -# Returns: -# Dict: Header components -# """ -# header_components = list( -# set([self.clean(paragraph) for paragraph in duplicate_paragraphs]) -# ) -# header_components = " ".join(header_components).strip().split(" ") -# header_components_count = Counter(header_components) -# header_components_count = { -# k.replace(":", ""): v -# for k, v in header_components_count.items() -# if v > 1 and len(k) > 3 -# } -# return header_components_count - -# def remove_header_lines( -# self, paragraphs: List[str], header_components_count: Dict -# ) -> List[str]: -# """ -# Remove paragraphs that contain any of the header words or the word 'Page' if remove_pagination is true. - -# Args: -# paragraphs (List[str]): Paragraphs -# header_components_count (Dict): Header components - -# Returns: -# List[str]: New paragraphs -# """ - -# def should_remove(paragraph): -# if self.remove_pagination and "Page" in paragraph: -# return True -# return any(word in paragraph for word in header_components_count.keys()) - -# return [paragraph for paragraph in paragraphs if not should_remove(paragraph)] - -# def merge_tables(self, md_content: str) -> str: -# """ -# Merge tables inside Markdown content. - -# Args: -# md_content (str): Markdown content - -# Returns: -# str: Merged tables -# """ -# md_content = md_content.replace("|\n\n|", "|\n|") -# return md_content - -# def save_cleaned_result(self, cleaned_result: str, output_path: str) -> None: -# """ -# Save the cleaned paragraphs to a markdown file. - -# Args: -# cleaned_result (str): Cleaned result -# output_path (str): Output path -# """ -# with open(output_path, "w") as f: -# f.write(cleaned_result) - -# def remove_header_llm(self): -# llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) -# # Define the prompt -# messages = [ -# ( -# "system", -# "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.", -# ), -# ] - -# prompt = f"""You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown. -# Here is a md file : "{self.md_result}" -# I want you to identify repetitive texts that could be associate to a document header and footer. Please identify the headers, the footer and remove them from the document. -# Answer with only the cleaned document in markdown format. -# Result : """ - -# messages.append(("human", self.md_result)) # type: ignore - -# result = llm.invoke(messages) - -# return result.content - -# def process(self, gpt4o_cleaner=False) -> str: -# """ -# Process the markdown result by removing duplicate paragraphs and headers. - -# Args: -# gpt4o_cleaner (bool, optional): GPT-4o cleaner. Defaults to False. - -# Returns: -# str: Cleaned result -# """ -# if gpt4o_cleaner: -# cleaned_result = self.remove_header_llm() - -# else: -# pages = self.split_into_pages() -# paragraphs = self.split_into_paragraphs(pages) -# # other_pages_paragraphs = self.split_into_paragraphs(pages[1:]) - -# cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates( -# paragraphs -# ) -# header_components_count = self.identify_header_components( -# duplicate_paragraphs -# ) - -# if self.strict: -# final_paragraphs = self.remove_header_lines( -# cleaned_paragraphs[5:], header_components_count -# ) -# final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs -# else: -# final_paragraphs = cleaned_paragraphs - -# # Combine first page paragraphs with cleaned paragraphs from other pages -# all_paragraphs = final_paragraphs -# cleaned_result = "\n\n".join(all_paragraphs) - -# cleaned_result = self.merge_tables(str(cleaned_result)) -# return cleaned_result diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index f5cd8bc..1dea322 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -1,18 +1,39 @@ -from megaparse import MegaParse +import asyncio +from pathlib import Path +from typing import List + +from langchain_openai import ChatOpenAI +from llama_index.core.schema import Document as LlamaDocument +from llama_parse import LlamaParse +from llama_parse.utils import Language, ResultType +from megaparse.formatter.structured_formatter.custom_structured_formatter import ( + CustomStructuredFormatter, +) +from megaparse.megaparse import MegaParse +from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser -import pypdfium2 as pdfium +from megaparse_sdk.schema.extensions import FileExtension +from pydantic import BaseModel, Field + + +class MyCustomFormat(BaseModel): + title: str = Field(description="The title of the document.") + problem: str = Field(description="The problem statement.") + solution: str = Field(description="The solution statement.") -def main(): - parser = UnstructuredParser() - megaparse = MegaParse(parser=parser) +async def main(): + # Parse a file + parser = DoctrParser() + model = ChatOpenAI(name="gpt-4o") + formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) - file_path = "./tests/pdf/native/0168029.pdf" + megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1]) - parsed_file = megaparse.load(file_path) - print(f"\n----- File Response : {file_path} -----\n") - print(parsed_file) + file_path = Path("./tests/pdf/sample_pdf.pdf") + result = await megaparse.aload(file_path=file_path) + print(result) if __name__ == "__main__": - main() + asyncio.run(main()) diff --git a/libs/megaparse/src/megaparse/formatter/base.py b/libs/megaparse/src/megaparse/formatter/base.py new file mode 100644 index 0000000..7243e80 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/base.py @@ -0,0 +1,33 @@ +from abc import ABC +from pathlib import Path +from typing import List, Union + +from langchain_core.language_models.chat_models import BaseChatModel +from megaparse.models.document import Document + + +class BaseFormatter(ABC): + """ + A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. + Attributes + ---------- + model : BaseChatModel + An instance of a chat model used to process and improve the layout of elements. + Methods + ------- + improve_layout(elements: List[Element]) -> List[Element] + Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. + """ + + def __init__(self, model: BaseChatModel | None = None): + self.model = model + + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Union[Document, str]: + raise NotImplementedError("Subclasses should implement this method") + + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Union[Document, str]: + raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py new file mode 100644 index 0000000..dba1089 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py @@ -0,0 +1,25 @@ +from pathlib import Path +from langchain_core.language_models.chat_models import BaseChatModel +from megaparse.formatter.base import BaseFormatter +from megaparse.models.document import Document +from pydantic import BaseModel + + +class StructuredFormatter(BaseFormatter): + def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): + super().__init__(model) + self.output_model = output_model + + async def aformat( + self, + document: Document, + file_path: Path | str | None = None, + ) -> str: # FIXME: Return a structured output of type BaseModel ? + raise NotImplementedError() + + def format( + self, + document: Document, + file_path: Path | str | None = None, + ) -> str: # FIXME: Return a structured output of type BaseModel ? + raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py new file mode 100644 index 0000000..858253d --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py @@ -0,0 +1,79 @@ +from pathlib import Path +from megaparse.formatter.structured_formatter import StructuredFormatter +from megaparse.models.document import Document +from pydantic import BaseModel + + +class CustomStructuredFormatter(StructuredFormatter): + def format( + self, + document: Document, + file_path: Path | str | None = None, + ) -> str: + """ + Structure the file using an AI language model. + Args: + text: The text to format. + file_path: The file path of the text. + model: The AI language model to use for formatting. + Returns: + The structured text. + """ + if not self.model: + raise ValueError("A Model is needed to use the CustomStructuredFormatter.") + print("Formatting text using CustomStructuredFormatter...") + text = str(document) + if len(text) < 0: + raise ValueError( + "A non empty text is needed to format text using CustomStructuredFormatter." + ) + if not self.output_model: + raise ValueError( + "An output model is needed to structure text using CustomStructuredFormatter." + ) + + structured_model = self.model.with_structured_output(self.output_model) # type: ignore + + formatted_text = structured_model.invoke( + f"Parse the text in a structured format: {text}" + ) + assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." + + return formatted_text.model_dump_json() + + async def aformat( + self, + document: Document, + file_path: Path | str | None = None, + ) -> str: + """ + Asynchronously structure the file using an AI language model. + Args: + text: The text to format. + file_path: The file path of the text. + model: The AI language model to use for formatting. + Returns: + The structured text. + """ + if not self.model: + raise ValueError("A Model is needed to use the CustomStructuredFormatter.") + print("Formatting text using CustomStructuredFormatter...") + text = str(document) + + if len(text) < 0: + raise ValueError( + "A non empty text is needed to format text using CustomStructuredFormatter." + ) + if not self.output_model: + raise ValueError( + "An output model is needed to structure text using CustomStructuredFormatter." + ) + + structured_model = self.model.with_structured_output(self.output_model) # type: ignore + + formatted_text = await structured_model.ainvoke( + f"Parse the text in a structured format: {text}" + ) + assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." + + return formatted_text.model_dump_json() diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py new file mode 100644 index 0000000..9b28987 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py @@ -0,0 +1,17 @@ +from pathlib import Path +from typing import Union + +from megaparse.formatter.base import BaseFormatter +from megaparse.models.document import Document + + +class TableFormatter(BaseFormatter): + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + raise NotImplementedError("Subclasses should implement this method") + + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py new file mode 100644 index 0000000..1c3eaea --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py @@ -0,0 +1,109 @@ +import re +import warnings +from pathlib import Path +from typing import Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import ChatPromptTemplate +from megaparse.formatter.table_formatter import TableFormatter +from megaparse.models.document import Document, TableBlock + + +class SimpleMDTableFormatter(TableFormatter): + """ + A formatter that converts table elements into Markdown format using llms. + """ + + TABLE_MARKER_START = "[TABLE]" + TABLE_MARKER_END = "[/TABLE]" + CODE_BLOCK_PATTERN = r"^```.*$\n?" + + def __init__(self, model: Optional[BaseChatModel] = None): + super().__init__(model) + + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + warnings.warn( + "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", + UserWarning, + stacklevel=2, + ) + return self.format(document=document, file_path=file_path) + + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + """ + Formats table elements within a list of elements. + Args: + elements: A list of Element objects. + Returns: + A list of Element objects with formatted tables. + """ + if not self.model: + raise ValueError("A Model is needed to use the SimpleMDTableFormatter.") + print("Formatting tables using SimpleMDTableFormatter...") + table_stack = [] + formatted_elements = [] + + for block in document.content: + if isinstance(block, TableBlock): + previous_table = table_stack[-1] if table_stack else "" + formatted_table = self.format_table(block, previous_table) + table_stack.append(formatted_table.text) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(block) + + document.content = formatted_elements + return document + + def format_table( + self, table_element: TableBlock, previous_table: str + ) -> TableBlock: + """ + Formats a single table element into Markdown using an AI language model. + Args: + table_element: The table element to format. + previous_table: The previously formatted table text. + Returns: + The formatted table element. + """ + assert self.model is not None, "Model is not set." + + prompt = ChatPromptTemplate.from_messages( + [ + ( + "human", + ( + "You are an expert in markdown tables. Transform the following parsed table into a " + "markdown table. Provide just the table in pure markdown, nothing else.\n" + "\n{text}\n\n" + "\n{previous_table}\n" + ), + ), + ] + ) + + chain = prompt | self.model + result = chain.invoke( + { + "text": table_element.text, + "previous_table": previous_table, + } + ) + + content_str = str(result.content) + cleaned_content = re.sub( + self.CODE_BLOCK_PATTERN, "", content_str, flags=re.MULTILINE + ) + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{cleaned_content}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + + table_element.text = markdown_table + + return table_element diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py new file mode 100644 index 0000000..e94d85b --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py @@ -0,0 +1,193 @@ +import base64 +from io import BytesIO +from pathlib import Path +from typing import List, Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import HumanMessage +from megaparse.formatter.table_formatter import TableFormatter +from megaparse.models.document import Document, TableBlock +from pdf2image import convert_from_path +from PIL import Image +from unstructured.documents.elements import Element + +TABLE_OCR_PROMPT = """ +You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. +Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. +""" + + +class VisionMDTableFormatter(TableFormatter): + """ + A formatter that converts table elements into Markdown format using an AI language model. + """ + + TABLE_MARKER_START = "[TABLE]" + TABLE_MARKER_END = "[/TABLE]" + CODE_BLOCK_PATTERN = r"^```.*$\n?" + + def __init__(self, model: Optional[BaseChatModel] = None): + super().__init__(model) + + def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str: + """ + Helper method to crop the table portion of the PDF page and convert it to a base64 string. + """ + assert table_element.bbox, "Table element must have coordinates." + bbox = table_element.bbox + page_number = table_element.page_range[0] + assert page_number, "Table element must have a page number." + assert bbox, "Table element must have coordinates." + + pages = convert_from_path(file_path) + + # Calculate the box for cropping + box = ( + bbox.top_left.x, + bbox.top_left.y, + bbox.bottom_right.x, + bbox.bottom_right.y, + ) + table_image = pages[page_number - 1].crop(box) + # Convert the cropped image to base64 + table_image64 = self.process_file([table_image])[0] + return table_image64 + + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + """ + Asynchronously formats table elements within a list of elements. + """ + if not self.model: + raise ValueError("A Model is needed to use the VisionMDTableFormatter.") + print("Formatting tables using VisionMDTableFormatter (async)...") + assert ( + file_path + ), "A file path is needed to format tables using VisionMDTableFormatter." + if not isinstance(file_path, str): + file_path = str(file_path) + formatted_elements = [] + for block in document.content: + if isinstance(block, TableBlock): + formatted_table = await self.aformat_table(block, file_path) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(block) + + document.content = formatted_elements + return document + + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + """ + Asynchronously formats table elements within a list of elements. + """ + if not self.model: + raise ValueError("A Model is needed to use the VisionMDTableFormatter.") + print("Formatting tables using VisionMDTableFormatter (async)...") + assert ( + file_path + ), "A file path is needed to format tables using VisionMDTableFormatter." + if not isinstance(file_path, str): + file_path = str(file_path) + formatted_elements = [] + for block in document.content: + if isinstance(block, TableBlock): + formatted_table = self.format_table(block, file_path) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(block) + + document.content = formatted_elements + return document + + async def aformat_table( + self, table_element: TableBlock, file_path: str + ) -> TableBlock: + """ + Asynchronously formats a table element into Markdown format using a Vision Model. + """ + table_image64 = self._crop_table_image(table_element, file_path) + formatted_table = await self.avision_extract(table_image64) + + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{formatted_table}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + # Replace the element's text with the formatted table text + table_element.text = markdown_table + return table_element + + def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock: + """ + Asynchronously formats a table element into Markdown format using a Vision Model. + """ + table_image64 = self._crop_table_image(table_element, file_path) + formatted_table = self.vision_extract(table_image64) + + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{formatted_table}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + # Replace the element's text with the formatted table text + table_element.text = markdown_table + return table_element + + def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]: + """ + Convert a list of PIL images to base64 encoded images. + """ + try: + images_base64 = [] + for image in images: + buffered = BytesIO() + image.save(buffered, format=image_format) + image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + images_base64.append(image_base64) + return images_base64 + except Exception as e: + raise ValueError(f"Error processing PDF file: {str(e)}") + + async def avision_extract(self, table_image: str) -> str: + """ + Asynchronously send image data to the language model for processing. + """ + assert ( + self.model + ), "A model is needed to use the VisionMDTableFormatter (async)." + image_prompt = { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, + } + + message = HumanMessage( + content=[ + {"type": "text", "text": TABLE_OCR_PROMPT}, + image_prompt, + ], + ) + response = await self.model.ainvoke([message]) + return str(response.content) + + def vision_extract(self, table_image: str) -> str: + """ + Synchronously send image data to the language model for processing. + """ + assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)." + image_prompt = { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, + } + + message = HumanMessage( + content=[ + {"type": "text", "text": TABLE_OCR_PROMPT}, + image_prompt, + ], + ) + response = self.model.invoke([message]) + return str(response.content) diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index b4580a0..9dfa1fb 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -1,15 +1,14 @@ -import asyncio import logging -import os +import warnings from pathlib import Path -from typing import IO, BinaryIO +from typing import IO, BinaryIO, List -from megaparse.configs.auto import DeviceEnum, MegaParseConfig from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum -from megaparse.checker.format_checker import FormatChecker +from megaparse.configs.auto import DeviceEnum, MegaParseConfig from megaparse.exceptions.base import ParsingException +from megaparse.formatter.base import BaseFormatter from megaparse.parser.base import BaseParser from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.strategy import StrategyHandler @@ -25,8 +24,8 @@ def __init__( self, parser: BaseParser | None = None, ocr_parser: BaseParser | None = None, + formatters: List[BaseFormatter] | None = None, strategy: StrategyEnum = StrategyEnum.AUTO, - format_checker: FormatChecker | None = None, ) -> None: if not parser: parser = UnstructuredParser(strategy=StrategyEnum.FAST) @@ -39,9 +38,8 @@ def __init__( self.strategy = strategy self.parser = parser + self.formatters = formatters self.ocr_parser = ocr_parser - self.format_checker = format_checker - self.last_parsed_document: str = "" self.strategy_handler = StrategyHandler( text_det_config=self.config.text_det_config, @@ -79,12 +77,6 @@ def validate_input( file_extension = FileExtension(file_extension) except ValueError: raise ValueError(f"Unsupported file extension: {file_extension}") - - if file_extension != FileExtension.PDF: - if self.format_checker: - raise ValueError( - f"Format Checker : Unsupported file extension: {file_extension}" - ) return file_extension async def aload( @@ -102,10 +94,23 @@ async def aload( parsed_document = await parser.aconvert( file_path=file_path, file=file, file_extension=file_extension ) + parsed_document.file_name = str(file_path) if file_path else None + + if self.formatters: + for formatter in self.formatters: + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break + parsed_document = await formatter.aformat(parsed_document) + # @chloe FIXME: format_checker needs unstructured Elements as input which is to change # if self.format_checker: - # parsed_document: str = await self.format_checker.check(parsed_document - self.last_parsed_document = parsed_document + # parsed_document: str = self.format_checker.check(parsed_document) + if not isinstance(parsed_document, str): + return str(parsed_document) return parsed_document except Exception as e: raise ParsingException( @@ -127,10 +132,23 @@ def load( parsed_document = parser.convert( file_path=file_path, file=file, file_extension=file_extension ) + parsed_document.file_name = str(file_path) if file_path else None + + if self.formatters: + for formatter in self.formatters: + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break + parsed_document = formatter.format(parsed_document) + # @chloe FIXME: format_checker needs unstructured Elements as input which is to change # if self.format_checker: - # parsed_document: str = await self.format_checker.check(parsed_document - self.last_parsed_document = parsed_document + # parsed_document: str = self.format_checker.check(parsed_document) + if not isinstance(parsed_document, str): + return str(parsed_document) return parsed_document except Exception as e: raise ParsingException( @@ -156,8 +174,3 @@ def _select_parser( if local_strategy == StrategyEnum.HI_RES: return self.ocr_parser return self.parser - - def save(self, file_path: Path | str) -> None: - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w+") as f: - f.write(self.last_parsed_document) diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py new file mode 100644 index 0000000..6d382be --- /dev/null +++ b/libs/megaparse/src/megaparse/models/document.py @@ -0,0 +1,224 @@ +import uuid +from typing import Any, Dict, List, Optional, Tuple + +from megaparse.predictor.models.base import BBOX +from pydantic import BaseModel, Field, field_validator + + +class Point2D(BaseModel): + """ + A class to represent a 2D point + + """ + + x: float + y: float + + +class Block(BaseModel): + """ + A class to represent a block + + """ + + block_id: Optional[uuid.UUID] = Field(default_factory=uuid.uuid4) + metadata: Dict[str, Any] # FIXME: TBD @Amine + bbox: Optional[BBOX] = ( + None # (x0,y0),(x1, y1) Coordinates are given as Relative positions to the page they are in + ) + page_range: Optional[Tuple[int, int]] = Field( + default=None + ) # (start_page, end_page) + + @field_validator("page_range") + def validate_range(cls, value): + if value is None: + return None + start, end = value + if start > end: + raise ValueError( + "The first value of the page range must be less than the second value" + ) + return value + + +class TextBlock(Block): + """ + A class to represent a text block + + """ + + text: str + + def __str__(self): + return self.text + + +class TitleBlock(TextBlock): + """ + A class to represent a title block + + """ + + def __str__(self): + return f"# {self.text}" + + +class SubTitleBlock(TextBlock): + """ + A class to represent a subtitle block + """ + + depth: int + + def __str__(self): + heading_level = min(self.depth + 1, 6) + return f"{'#' * heading_level} {self.text}" + + +class ImageBlock(Block): + """ + A class to represent an image block + """ + + text: Optional[str] = None + caption: Optional[str] = "unknown" + + def __str__(self) -> str: + return f"[Image: {self.caption}]" + + +class TableBlock(ImageBlock): + """ + A class to represent a table block + + """ + + def __str__(self): + return self.text if self.text else f"[Table : {self.caption}]" + + +class ListElement(BaseModel): + """ + A class to represent a list element + + """ + + text: str + depth: int + + +class ListBlock(TextBlock): + """ + A class to represent a list block + + """ + + list_elements: List[ListElement] + + # rajouter fonction pydantic pour compute l attribut + + def __str__(self): + return "\n".join( + f"{' ' * (2 * element.depth)}* {element.text}" + for element in self.list_elements + ) + + +class HeaderBlock(TextBlock): + """ + A class to represent a header block + + """ + + def __str__(self): + return f"{'='*len(self.text)}\n\n{self.text}\n\n{'='*len(self.text)}" + + +class FooterBlock(TextBlock): + """ + A class to represent a footer block + + """ + + def __str__(self): + return f"{'='*len(self.text)}\n\n{self.text}\n\n{'='*len(self.text)}" + + +class SectionBlock(Block): + """ + A class to represent a section block + + """ + + title: str + depth: int + content: List[Block] + + def __str__(self): + lines = [] + lines.extend(str(block) for block in self.content) + return "\n".join(lines) + + +class TOCItem(BaseModel): + title: str + depth: int + page_range: Tuple[int, int] = Field(...) # (start_page, end_page) + + @field_validator("page_range") + def validate_range(cls, value): + start, end = value + if start >= end: + raise ValueError( + "The first value of the page range must be less than the second value" + ) + return value + + def __str__(self): + start_page, end_page = self.page_range + page_info = ( + f"page {start_page}" + if start_page == end_page + else f"pages {start_page}-{end_page}" + ) + return f"{' ' * (2 * self.depth)}* {self.title} ({page_info})" + + +class TOC(BaseModel): + content: List[TOCItem] + + @property + def text(self) -> str: + return "\n".join(str(item) for item in self.content) + + def __str__(self): + return self.text + + +class Document(BaseModel): + """ + + A class to represent a document + + """ + + file_name: Optional[str] = None + table_of_contents: Optional[TOC] = None + metadata: Dict[str, Any] # TBD @Amine + content: List[Block] + detection_origin: str + + def __str__(self) -> str: + lines = [] + + # If there's a table of contents, include it + if self.table_of_contents: + lines.append("Table of Contents:") + # Use TOC’s own string-building property or method + lines.append(self.table_of_contents.text) + + # Print each block’s text representation + lines.extend(str(block) for block in self.content) + + return "\n".join(lines) diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index ab378d8..8c3964d 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -4,6 +4,8 @@ from megaparse_sdk.schema.extensions import FileExtension +from megaparse.models.document import Document + class BaseParser(ABC): """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]""" @@ -15,12 +17,12 @@ def check_supported_extension( ): if not file_extension and not file_path: raise ValueError( - "Either file_path or file_extension must be provided for {self.__class__.__name__}" + f"Either file_path or file_extension must be provided for {self.__class__.__name__}" ) if file_path and not file_extension: file_path = Path(file_path) if isinstance(file_path, str) else file_path file_extension = FileExtension(file_path.suffix) - if file_extension not in self.supported_extensions: + if file_extension and file_extension not in self.supported_extensions: raise ValueError( f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}" ) @@ -32,7 +34,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> str: + ) -> Document: """ Convert the given file to a specific format. @@ -55,9 +57,9 @@ def convert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> str: + ) -> Document: """ - Convert the given file to a specific format. + Convert the given file to the unstructured format. Args: file_path (str | Path): The path to the file to be converted. diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py index 38efe08..29a3a7e 100644 --- a/libs/megaparse/src/megaparse/parser/doctr_parser.py +++ b/libs/megaparse/src/megaparse/parser/doctr_parser.py @@ -6,11 +6,14 @@ from megaparse.configs.auto import DeviceEnum, TextRecoConfig, TextDetConfig import onnxruntime as rt from megaparse_sdk.schema.extensions import FileExtension -from onnxtr.io import DocumentFile +from onnxtr.io import Document, DocumentFile from onnxtr.models import ocr_predictor from onnxtr.models.engine import EngineConfig +from megaparse.models.document import Document as MPDocument +from megaparse.models.document import ImageBlock, TextBlock from megaparse.parser.base import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D logger = logging.getLogger("megaparse") @@ -80,7 +83,7 @@ def convert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: if file: file.seek(0) pdf = file.read() @@ -93,8 +96,9 @@ def convert( doc = DocumentFile.from_pdf(pdf) # Analyze - result = self.predictor(doc) - return result.render() + doctr_result = self.predictor(doc) + + return self.__to_elements_list(doctr_result) async def aconvert( self, @@ -102,10 +106,60 @@ async def aconvert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: warnings.warn( - "The UnstructuredParser is a sync parser, please use the sync convert method", + "The DocTRParser is a sync parser, please use the sync convert method", UserWarning, stacklevel=2, ) return self.convert(file_path, file, file_extension, **kwargs) + + def __to_elements_list(self, doctr_document: Document) -> MPDocument: + result = [] + + for page_number, page in enumerate(doctr_document.pages): + for block in page.blocks: + if len(block.lines) and len(block.artefacts) > 0: + raise ValueError( + "Block should not contain both lines and artefacts" + ) + word_coordinates = [ + word.geometry for line in block.lines for word in line.words + ] + x0 = min(word[0][0] for word in word_coordinates) + y0 = min(word[0][1] for word in word_coordinates) + x1 = max(word[1][0] for word in word_coordinates) + y1 = max(word[1][1] for word in word_coordinates) + + result.append( + TextBlock( + text=block.render(), + bbox=BBOX( + top_left=Point2D(x=x0, y=y0), + bottom_right=Point2D(x=x1, y=y1), + ), + metadata={}, + page_range=(page_number, page_number), + ) + ) + + for artefact in block.artefacts: + result.append( + ImageBlock( + bbox=BBOX( + top_left=Point2D( + x=artefact.geometry[0][0], y=artefact.geometry[0][1] + ), + bottom_right=Point2D( + x=artefact.geometry[1][0], y=artefact.geometry[1][1] + ), + ), + metadata={}, + page_range=(page_number, page_number), + ) + ) + return MPDocument( + metadata={}, + content=result, + detection_origin="doctr", + ) diff --git a/libs/megaparse/src/megaparse/parser/llama.py b/libs/megaparse/src/megaparse/parser/llama.py index 9cb0d8c..40321ea 100644 --- a/libs/megaparse/src/megaparse/parser/llama.py +++ b/libs/megaparse/src/megaparse/parser/llama.py @@ -1,4 +1,3 @@ -import asyncio from pathlib import Path from typing import IO, List @@ -7,7 +6,10 @@ from llama_parse.utils import Language, ResultType from megaparse_sdk.schema.extensions import FileExtension +from megaparse.models.document import Document as MPDocument +from megaparse.models.document import TextBlock from megaparse.parser import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D class LlamaParser(BaseParser): @@ -36,7 +38,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) @@ -51,12 +53,8 @@ async def aconvert( ) documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path)) - parsed_md = "" - for document in documents: - text_content = document.text - parsed_md = parsed_md + text_content - return parsed_md + return self.__to_elements_list__(documents) def convert( self, @@ -64,14 +62,14 @@ def convert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) llama_parser = _LlamaParse( api_key=self.api_key, - result_type=ResultType.MD, + result_type=ResultType.JSON, gpt4o_mode=True, verbose=self.verbose, language=self.language, @@ -79,9 +77,24 @@ def convert( ) documents: List[LlamaDocument] = llama_parser.load_data(str(file_path)) - parsed_md = "" - for document in documents: - text_content = document.text - parsed_md = parsed_md + text_content - return parsed_md + return self.__to_elements_list__(documents) + + def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument: + list_blocks = [] + for i, page in enumerate(llama_doc): + list_blocks.append( + TextBlock( + text=page.text, + metadata={}, + page_range=(i, i + 1), + bbox=BBOX( + top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1) + ), + ) + ) + return MPDocument( + metadata={}, + detection_origin="llamaparse", + content=list_blocks, + ) diff --git a/libs/megaparse/src/megaparse/parser/megaparse_vision.py b/libs/megaparse/src/megaparse/parser/megaparse_vision.py index 0b05e73..39490ff 100644 --- a/libs/megaparse/src/megaparse/parser/megaparse_vision.py +++ b/libs/megaparse/src/megaparse/parser/megaparse_vision.py @@ -3,15 +3,18 @@ import re from io import BytesIO from pathlib import Path -from typing import IO, List, Union +from typing import IO, List from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage from megaparse_sdk.schema.extensions import FileExtension from pdf2image import convert_from_path +from megaparse.models.document import Block, TextBlock +from megaparse.models.document import Document as MPDocument from megaparse.parser import BaseParser from megaparse.parser.entity import SupportedModel, TagEnum +from megaparse.predictor.models.base import BBOX, Point2D # BASE_OCR_PROMPT = """ # Transcribe the content of this file into markdown. Be mindful of the formatting. @@ -147,7 +150,7 @@ async def aconvert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> str: + ) -> MPDocument: """ Parse a PDF file and process its content using the language model. @@ -164,13 +167,14 @@ async def aconvert( self.check_supported_extension(file_extension, file_path) pdf_base64 = self.process_file(file_path) + n_pages = len(pdf_base64) tasks = [ self.asend_to_mlm(pdf_base64[i : i + batch_size]) for i in range(0, len(pdf_base64), batch_size) ] self.parsed_chunks = await asyncio.gather(*tasks) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return responses + return self.__to_elements_list__(responses, n_pages=n_pages) def convert( self, @@ -179,7 +183,7 @@ def convert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> str: + ) -> MPDocument: """ Parse a PDF file and process its content using the language model. @@ -196,6 +200,7 @@ def convert( self.check_supported_extension(file_extension, file_path) pdf_base64 = self.process_file(file_path) + n_pages = len(pdf_base64) chunks = [ pdf_base64[i : i + batch_size] for i in range(0, len(pdf_base64), batch_size) @@ -205,7 +210,7 @@ def convert( response = self.send_to_mlm(chunk) self.parsed_chunks.append(response) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return responses + return self.__to_elements_list__(responses, n_pages) def get_cleaned_content(self, parsed_file: str) -> str: """ @@ -245,3 +250,18 @@ def remove_tag(match): cleaned_content = cleaned_content.strip() return cleaned_content + + def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument: + list_blocks: List[Block] = [ + TextBlock( + text=mpv_doc, + metadata={}, + page_range=(0, n_pages - 1), + bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)), + ) + ] + return MPDocument( + metadata={}, + detection_origin="megaparse_vision", + content=list_blocks, + ) diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index b5cca09..294f386 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -1,16 +1,29 @@ -import re import warnings from pathlib import Path -from typing import IO +from typing import IO, Dict, List from dotenv import load_dotenv from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.prompts import ChatPromptTemplate from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum +from unstructured.documents.elements import Element from unstructured.partition.auto import partition +from megaparse.models.document import ( + Block, + FooterBlock, + HeaderBlock, + ImageBlock, + SubTitleBlock, + TableBlock, + TextBlock, + TitleBlock, +) +from megaparse.models.document import ( + Document as MPDocument, +) from megaparse.parser import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D class UnstructuredParser(BaseParser): @@ -37,114 +50,322 @@ def __init__( self.strategy = strategy self.model = model - # Function to convert element category to markdown format - def convert_to_markdown(self, elements): - markdown_content = "" - - for el in elements: - markdown_content += self.get_markdown_line(el) - - return markdown_content - - def get_markdown_line(self, el: dict): - element_type = el["type"] - text = el["text"] - metadata = el["metadata"] - parent_id = metadata.get("parent_id", None) - category_depth = metadata.get("category_depth", 0) - table_stack = [] # type: ignore - - # Markdown line defaults to empty - markdown_line = "" - - # Element type-specific markdown content - markdown_types = { - "Title": f"## {text}\n\n" if parent_id else f"# {text}\n\n", - "Subtitle": f"## {text}\n\n", - "Header": f"{'#' * (category_depth + 1)} {text}\n\n", - "Footer": f"#### {text}\n\n", - "NarrativeText": f"{text}\n\n", - "ListItem": f"- {text}\n", - "Table": f"{text}\n\n", - "PageBreak": "---\n\n", - "Image": f"![Image]({el['metadata'].get('image_path', '')})\n\n", - "Formula": f"$$ {text} $$\n\n", - "FigureCaption": f"**Figure:** {text}\n\n", - "Address": f"**Address:** {text}\n\n", - "EmailAddress": f"**Email:** {text}\n\n", - "CodeSnippet": f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n", - "PageNumber": "", # Page number is not included in markdown - } - - markdown_line = markdown_types.get(element_type, f"{text}\n\n") - - if element_type == "Table" and self.model: - # FIXME: @Chloé - Add a modular table enhancement here - LVM - prompt = ChatPromptTemplate.from_messages( - [ - ( - "human", - """You are an expert in markdown tables, match this text and this html table to fill a md table. You answer with just the table in pure markdown, nothing else. - - {text} - - - {html} - - - {previous_table} - """, - ), - ] - ) - chain = prompt | self.model - result = chain.invoke( - { - "text": el["text"], - "html": metadata["text_as_html"], - "previous_table": table_stack[-1] if table_stack else "", - } - ) - content_str = ( - str(result.content) - if not isinstance(result.content, str) - else result.content - ) - cleaned_content = re.sub(r"^```.*$\n?", "", content_str, flags=re.MULTILINE) - markdown_line = f"[TABLE]\n{cleaned_content}\n[/TABLE]\n\n" - - return markdown_line - - async def aconvert( + def convert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> str: + ) -> MPDocument: self.check_supported_extension(file_extension, file_path) - warnings.warn( - "The UnstructuredParser is a sync parser, please use the sync convert method", - UserWarning, - stacklevel=2, + # Partition the PDF + elements = partition( + filename=str(file_path) if file_path else None, + file=file, + strategy=self.strategy, + content_type=file_extension.mimetype if file_extension else None, ) - return self.convert(file_path, file, file_extension, **kwargs) + return self.__to_mp_document(elements) - def convert( + async def aconvert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> str: + ) -> MPDocument: self.check_supported_extension(file_extension, file_path) + warnings.warn( + "The UnstructuredParser is a sync parser, please use the sync convert method", + UserWarning, + stacklevel=2, + ) + return self.convert(file_path, file, file_extension, **kwargs) - elements = partition( - filename=str(file_path) if file_path else None, - file=file, - strategy=self.strategy, - content_type=file_extension.mimetype if file_extension else None, + def __to_mp_document(self, elements: List[Element]) -> MPDocument: + text_blocks = [] + for element in elements: + block = self.__convert_element_to_block(element) + if block: + text_blocks.append(block) + return MPDocument( + content=text_blocks, metadata={}, detection_origin="unstructured" ) - elements_dict = [el.to_dict() for el in elements] - markdown_content = self.convert_to_markdown(elements_dict) - return markdown_content + + def __convert_element_to_block(self, element: Element) -> Block | None: + element_type = element.category + text = element.text + metadata = element.metadata + category_depth = metadata.category_depth + + # Element type-specific markdown content + markdown_types: Dict[str, Block] = { + "Title": TitleBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Subtitle": SubTitleBlock( + text=text, + depth=category_depth if category_depth else 0, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Header": HeaderBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Footer": FooterBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "NarrativeText": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "ListItem": TextBlock( # FIXME: @chloedia, list item need to be handled differently in ListBlock + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Table": TableBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Image": ImageBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Formula": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "FigureCaption": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Address": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "EmailAddress": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "CodeSnippet": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "UncategorizedText": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + } + return markdown_types.get(element_type, None) diff --git a/libs/megaparse/tests/conftest.py b/libs/megaparse/tests/conftest.py index e898f81..41eceda 100644 --- a/libs/megaparse/tests/conftest.py +++ b/libs/megaparse/tests/conftest.py @@ -8,6 +8,7 @@ from megaparse.api.app import app, get_playwright_loader, parser_builder_dep from megaparse.parser.base import BaseParser from megaparse_sdk.schema.extensions import FileExtension +from megaparse.models.document import Document as MPDocument, TextBlock class FakeParserBuilder: @@ -29,9 +30,14 @@ def convert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: print("Fake parser is converting the file") - return "Fake conversion result" + return MPDocument( + file_name="Fake file", + content=[TextBlock(text="Fake conversion result", metadata={})], + metadata={}, + detection_origin="fakeparser", + ) async def aconvert( self, @@ -39,9 +45,14 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: print("Fake parser is converting the file") - return "Fake conversion result" + return MPDocument( + file_name="Fake file", + content=[TextBlock(text="Fake conversion result", metadata={})], + metadata={}, + detection_origin="fakeparser", + ) return FakeParser() diff --git a/libs/megaparse/tests/pdf/test_detect_ocr.py b/libs/megaparse/tests/pdf/test_detect_ocr.py index 6b6c57d..4373a7e 100644 --- a/libs/megaparse/tests/pdf/test_detect_ocr.py +++ b/libs/megaparse/tests/pdf/test_detect_ocr.py @@ -12,6 +12,9 @@ @pytest.mark.parametrize("hi_res_pdf", ocr_pdfs) def test_hi_res_strategy(hi_res_pdf): + if hi_res_pdf == "0168004.pdf": + pytest.skip("Skip 0168004.pdf as it is flaky currently") + strategy = strategy_handler.determine_strategy( f"./tests/pdf/ocr/{hi_res_pdf}", ) diff --git a/libs/megaparse/tests/test_parsers.py b/libs/megaparse/tests/test_parsers.py index ae081dd..40e772a 100644 --- a/libs/megaparse/tests/test_parsers.py +++ b/libs/megaparse/tests/test_parsers.py @@ -34,7 +34,7 @@ def test_sync_parser(parser, extension): response = myparser.convert(file_path) assert response - assert len(response) > 0 + assert len(str(response)) > 0 else: with pytest.raises(ValueError): myparser.convert(file_path) diff --git a/requirements-dev.lock b/requirements-dev.lock index 05ce254..f1246a0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -255,7 +255,7 @@ layoutparser==0.3.4 # via unstructured-inference llama-index-core==0.12.0 # via llama-parse -llama-parse==0.5.14 +llama-parse==0.5.19 # via megaparse loguru==0.7.2 # via megaparse-sdk @@ -495,6 +495,7 @@ pydantic==2.9.2 # via langchain-core # via langsmith # via llama-index-core + # via llama-parse # via openai # via pydantic-settings # via unstructured-client diff --git a/requirements.lock b/requirements.lock index e0720ab..f747b77 100644 --- a/requirements.lock +++ b/requirements.lock @@ -209,7 +209,7 @@ layoutparser==0.3.4 # via unstructured-inference llama-index-core==0.12.0 # via llama-parse -llama-parse==0.5.14 +llama-parse==0.5.19 # via megaparse loguru==0.7.2 # via megaparse-sdk @@ -413,6 +413,7 @@ pydantic==2.9.2 # via langchain-core # via langsmith # via llama-index-core + # via llama-parse # via openai # via pydantic-settings # via unstructured-client