From 20cc56291162ae87c7d97ea8771f2ce1d58019b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chlo=C3=A9=20Daems?= <73901882+chloedia@users.noreply.github.com> Date: Mon, 3 Jun 2024 12:03:40 +0200 Subject: [PATCH] add: gpt cleaner for header and footer (#13) --- megaparse/Converter.py | 8 +- megaparse/markdown_processor.py | 59 +++++++--- notebooks/evaluate.ipynb | 189 +++++--------------------------- 3 files changed, 76 insertions(+), 180 deletions(-) diff --git a/megaparse/Converter.py b/megaparse/Converter.py index bff1649..cfad5db 100644 --- a/megaparse/Converter.py +++ b/megaparse/Converter.py @@ -233,7 +233,7 @@ def _unstructured_parse(self, file_path: str): unstructured_parser = UnstructuredParser() return unstructured_parser.convert(file_path) - def convert(self, file_path: str) -> str: + def convert(self, file_path: str, gpt4o_cleaner = False) -> str: parsed_md = "" if self.llama_parse_api_key: parsed_md = self._llama_parse(self.llama_parse_api_key, file_path) @@ -248,7 +248,7 @@ def convert(self, file_path: str) -> str: strict=self.handle_header, remove_pagination=self.handle_pagination, ) - md_cleaned = md_processor.process() + md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner) return md_cleaned def save_md(self, md_content: str, file_path: Path | str) -> None: @@ -261,7 +261,7 @@ def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> No self.file_path = file_path self.llama_parse_api_key = llama_parse_api_key - def convert(self) -> str: + def convert(self, **kwargs) -> str: file_extension: str = os.path.splitext(self.file_path)[1] if file_extension == ".docx": @@ -277,7 +277,7 @@ def convert(self) -> str: else: print(self.file_path, file_extension) raise ValueError(f"Unsupported file extension: {file_extension}") - return converter.convert(self.file_path) + return converter.convert(self.file_path, **kwargs) def save_md(self, md_content: str, file_path: Path | str) -> None: os.makedirs(os.path.dirname(file_path), exist_ok=True) diff --git a/megaparse/markdown_processor.py b/megaparse/markdown_processor.py index abce481..15a39a5 100644 --- a/megaparse/markdown_processor.py +++ b/megaparse/markdown_processor.py @@ -1,6 +1,8 @@ import os from collections import Counter from typing import LiteralString +from langchain_openai import ChatOpenAI + class MarkdownProcessor: def __init__(self, md_result: str, strict: bool, remove_pagination: bool): @@ -67,25 +69,54 @@ def save_cleaned_result(self, cleaned_result: str, output_path: str): with open(output_path, "w") as f: f.write(cleaned_result) - def process(self): + def remove_header_llm(self): + llm = ChatOpenAI(model="gpt-4o") + # Define the prompt + messages = [ + ( + "system", + "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.", + ), + ] + + prompt = f"""You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown. + Here is a md file : "{self.md_result}" + I want you to identify repetitive texts that could be associate to a document header and footer. Please identify the headers, the footer and remove them from the document. + Answer with only the cleaned document in markdown format. + Result : """ + + messages.append(("human", self.md_result)) #type: ignore + + result = llm.invoke(messages) + + return result.content + + + + def process(self, gpt4o_cleaner = False): """Process the markdown result by removing duplicate paragraphs and headers.""" - pages = self.split_into_pages() - paragraphs = self.split_into_paragraphs(pages) - #other_pages_paragraphs = self.split_into_paragraphs(pages[1:]) - cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates(paragraphs) - header_components_count = self.identify_header_components(duplicate_paragraphs) + if gpt4o_cleaner: + cleaned_result = self.remove_header_llm() - if self.strict: - final_paragraphs = self.remove_header_lines(cleaned_paragraphs[5:], header_components_count) - final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs else: - final_paragraphs = cleaned_paragraphs + pages = self.split_into_pages() + paragraphs = self.split_into_paragraphs(pages) + #other_pages_paragraphs = self.split_into_paragraphs(pages[1:]) + + cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates(paragraphs) + header_components_count = self.identify_header_components(duplicate_paragraphs) + + if self.strict: + final_paragraphs = self.remove_header_lines(cleaned_paragraphs[5:], header_components_count) + final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs + else: + final_paragraphs = cleaned_paragraphs - # Combine first page paragraphs with cleaned paragraphs from other pages - all_paragraphs = final_paragraphs - cleaned_result = "\n\n".join(all_paragraphs) + # Combine first page paragraphs with cleaned paragraphs from other pages + all_paragraphs = final_paragraphs + cleaned_result = "\n\n".join(all_paragraphs) - cleaned_result = self.merge_tables(cleaned_result) + cleaned_result = self.merge_tables(str(cleaned_result)) return cleaned_result diff --git a/notebooks/evaluate.ipynb b/notebooks/evaluate.ipynb index 4a51fab..1449a84 100644 --- a/notebooks/evaluate.ipynb +++ b/notebooks/evaluate.ipynb @@ -9,165 +9,24 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Started parsing the file under job_id febd501f-09c4-497a-9152-5c36d12db1cf\n", - "Parsing table
My Mega fakereport|#1756394 31/05/2024
\n", - "Table | My Mega fake | report | |#1756394 31/05/2024 |\n", - "|--------------|--------|----------------------|\n", - "| | | |\n", - "\n", - " improved\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': '816d843b751910db0a06ba8f1ffd7fc8', 'text': 'Why Mega Parse might be the best ?', 'metadata': {'detection_class_prob': 0.6813323497772217, 'coordinates': {'points': ((199.20834350585938, 463.5521545410156), (199.20834350585938, 543.973876953125), (1387.9010177670898, 543.973876953125), (1387.9010177670898, 463.5521545410156)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': '06184a13c08b5aaadce7d8236ae47d3e', 'text': 'Introduction', 'metadata': {'detection_class_prob': 0.8039769530296326, 'coordinates': {'points': ((200.0, 614.0830078125), (200.0, 672.2964919278969), (493.139404296875, 672.2964919278969), (493.139404296875, 614.0830078125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'ff7a23ceec0664a8640951a013d196aa', 'text': \"Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.\", 'metadata': {'detection_class_prob': 0.9467281699180603, 'coordinates': {'points': ((195.34674072265625, 700.1703491210938), (195.34674072265625, 1001.353272486111), (1487.6627548333329, 1001.353272486111), (1487.6627548333329, 700.1703491210938)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '06184a13c08b5aaadce7d8236ae47d3e', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': '1fbe1a100b42c04a32346c5eb5858075', 'text': 'Features of Mega Parse', 'metadata': {'detection_class_prob': 0.8466414213180542, 'coordinates': {'points': ((198.63735961914062, 1108.432861328125), (198.63735961914062, 1171.91552734375), (799.0963745117188, 1171.91552734375), (799.0963745117188, 1108.432861328125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '9262402749e817eb511b6ea6983fada4', 'text': 'Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.', 'metadata': {'detection_class_prob': 0.9040734767913818, 'coordinates': {'points': ((199.1422882080078, 1200.2735595703125), (199.1422882080078, 1279.280272486111), (1467.9168701171875, 1279.280272486111), (1467.9168701171875, 1200.2735595703125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '65c50deab71fcbf19a3065136b8e2594', 'text': 'Multiple Format Support: Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.', 'metadata': {'detection_class_prob': 0.9433316588401794, 'coordinates': {'points': ((198.0052490234375, 1330.345947265625), (198.0052490234375, 1499.6782169305554), (1429.8336181640625, 1499.6782169305554), (1429.8336181640625, 1330.345947265625)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'a1dec3d0f0f460c45a6f7d5b029db1fb', 'text': 'High-Speed Processing: One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.', 'metadata': {'detection_class_prob': 0.9479358196258545, 'coordinates': {'points': ((196.70298767089844, 1550.4149169921875), (196.70298767089844, 1720.0761613749999), (1455.1326904296875, 1720.0761613749999), (1455.1326904296875, 1550.4149169921875)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '593ccfd14e6fbc92cf874e60f390b49a', 'text': 'Markdown Output: Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.', 'metadata': {'detection_class_prob': 0.9448536038398743, 'coordinates': {'points': ((194.30455017089844, 1771.965576171875), (194.30455017089844, 1984.5536891527777), (1493.1640671666669, 1984.5536891527777), (1493.1640671666669, 1771.965576171875)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'cd8c52bb2d7b70db718471e8994e8652', 'text': 'Page 1', 'metadata': {'detection_class_prob': 0.715349018573761, 'coordinates': {'points': ((199.8196258544922, 2062.177734375), (199.8196258544922, 2094.729626681885), (298.4814147949219, 2094.729626681885), (298.4814147949219, 2062.177734375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
My Mega fake report|#1756394
31/05/2024
\n", - "Table | My Mega fake report | #1756394 |\n", - "|---------------------|------------|\n", - "| | 31/05/2024 |\n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'f1f61897e9596797b75759bf6f4a0e32', 'text': 'Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements.', 'metadata': {'detection_class_prob': 0.9468010067939758, 'coordinates': {'points': ((199.31332397460938, 463.8409423828125), (199.31332397460938, 631.5938280416664), (1498.865478515625, 631.5938280416664), (1498.865478515625, 463.8409423828125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '76cf7dddff60ac00c26a19f82a808a8c', 'text': 'Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases.', 'metadata': {'detection_class_prob': 0.9284847974777222, 'coordinates': {'points': ((200.0, 683.8502197265625), (200.0, 807.9121891527775), (1486.739013671875, 807.9121891527775), (1486.739013671875, 683.8502197265625)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'c4a3c67ab50d3883fef309baedfdadc4', 'text': 'Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time.', 'metadata': {'detection_class_prob': 0.9300038814544678, 'coordinates': {'points': ((200.0, 860.6484985351562), (200.0, 984.7760009765625), (1498.7467393333332, 984.7760009765625), (1498.7467393333332, 860.6484985351562)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'bb060c3a215a08c3b42f33e4ce5cdbeb', 'text': 'Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.', 'metadata': {'detection_class_prob': 0.8979511857032776, 'coordinates': {'points': ((196.416015625, 1036.5633544921875), (196.416015625, 1160.548911375), (1386.8544921875, 1160.548911375), (1386.8544921875, 1036.5633544921875)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '1fbe1a100b42c04a32346c5eb5858075', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': '26badc3ed49908af019ea9017895fab1', 'text': 'Benefits of Mega Parse', 'metadata': {'detection_class_prob': 0.8408889174461365, 'coordinates': {'points': ((200.0, 1269.0089111328125), (200.0, 1331.1937255859375), (780.8248291015625, 1331.1937255859375), (780.8248291015625, 1269.0089111328125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '7923c5f5216462840306adcb52b05ae9', 'text': 'The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.', 'metadata': {'detection_class_prob': 0.9141799211502075, 'coordinates': {'points': ((199.60433959960938, 1356.43408203125), (199.60433959960938, 1438.4758835972218), (1496.486572265625, 1438.4758835972218), (1496.486572265625, 1356.43408203125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '4a9e0b7c692b589908082703fafc93a2', 'text': 'Efficiency: By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.', 'metadata': {'detection_class_prob': 0.94883131980896, 'coordinates': {'points': ((198.958740234375, 1489.6474609375), (198.958740234375, 1658.8738280416667), (1483.9267578125, 1658.8738280416667), (1483.9267578125, 1489.6474609375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'a1b1485a56ede20ff77e777a2ba8179f', 'text': \"Versatility: Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.\", 'metadata': {'detection_class_prob': 0.9241225719451904, 'coordinates': {'points': ((199.05169677734375, 1709.4368896484375), (199.05169677734375, 1835.1921891527777), (1468.0664058333332, 1835.1921891527777), (1468.0664058333332, 1709.4368896484375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '52db73241ccc435f4623b3109714ff02', 'text': 'Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but', 'metadata': {'detection_class_prob': 0.9148008227348328, 'coordinates': {'points': ((200.0, 1887.0443115234375), (200.0, 1967.4309669305553), (1458.177734375, 1967.4309669305553), (1458.177734375, 1887.0443115234375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '04e582563851711ff6303ec3f5bc60db', 'text': 'Page 2', 'metadata': {'detection_class_prob': 0.6826891899108887, 'coordinates': {'points': ((200.0, 2061.138916015625), (200.0, 2094.729626681885), (299.0777893066406, 2094.729626681885), (299.0777893066406, 2061.138916015625)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
My Mega fakereport|#1756394 31/05/2024
\n", - "Table | My Mega fake | report | |#1756394 31/05/2024 |\n", - "|--------------|--------|---------------------|\n", - "| | | |\n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '204ab9d43a76958e946ef25f5dd3e3db', 'text': 'also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.', 'metadata': {'detection_class_prob': 0.907528817653656, 'coordinates': {'points': ((200.0, 421.9421391527776), (200.0, 499.3550613750001), (1483.7049560546875, 499.3550613750001), (1483.7049560546875, 421.9421391527776)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '9976f98fde3a4fa72b392ef65b8e4ec1', 'text': 'Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.', 'metadata': {'detection_class_prob': 0.9413748383522034, 'coordinates': {'points': ((193.52938842773438, 553.0362548828125), (193.52938842773438, 719.7530224861111), (1498.8444005, 719.7530224861111), (1498.8444005, 553.0362548828125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '72e5c6dd8ee282c36404a017a9b5abd2', 'text': 'Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs.', 'metadata': {'detection_class_prob': 0.9315865635871887, 'coordinates': {'points': ((198.14947509765625, 771.4175415039062), (198.14947509765625, 896.417236328125), (1428.92578125, 896.417236328125), (1428.92578125, 771.4175415039062)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'b6e4baa9fed51af8a09963845913e37d', 'text': 'Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.', 'metadata': {'detection_class_prob': 0.9432715177536011, 'coordinates': {'points': ((197.68289184570312, 947.6447143554688), (197.68289184570312, 1116.4693280416666), (1406.2500105000001, 1116.4693280416666), (1406.2500105000001, 947.6447143554688)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': '26badc3ed49908af019ea9017895fab1', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': 'a680e88f19f77795f8d682a5d6a7d031', 'text': 'Comparative Performance', 'metadata': {'detection_class_prob': 0.870198667049408, 'coordinates': {'points': ((198.4755859375, 1269.5745849609375), (198.4755859375, 1330.1820068359375), (854.2064819335938, 1330.1820068359375), (854.2064819335938, 1269.5745849609375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'de0739f8b56bc9d2caa11ca01ef28b0f', 'text': 'The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.', 'metadata': {'detection_class_prob': 0.932475745677948, 'coordinates': {'points': ((200.0, 1358.32763671875), (200.0, 1472.0147100152183), (1488.2640380859375, 1472.0147100152183), (1488.2640380859375, 1358.32763671875)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': 'a680e88f19f77795f8d682a5d6a7d031', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
MetricMega ParseParser AParser BParser CParser D
Supported FormatsPDF, DOCX, PPTXPDF, DOCXDOCX, PPTX| PDF, PPTXPDF, DOCX, XLSX
Conversion Speed (pages/min)
\n", - "Table | Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |\n", - "|--------------------------|-------------------|----------------|----------------|----------------|-------------------|\n", - "| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX |\n", - "| Conversion Speed (pages/min) | | | | | |\n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'bcde831ddc3b7e8b4db9627b547db6ea', 'text': 'Page 3', 'metadata': {'detection_class_prob': 0.49129852652549744, 'coordinates': {'points': ((200.0, 2063.436767578125), (200.0, 2094.729265570774), (298.49969482421875, 2094.729265570774), (298.49969482421875, 2063.436767578125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 3, 'parent_id': 'a680e88f19f77795f8d682a5d6a7d031', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
My Mega fake report|#1756394
31/05/2024
\n", - "Table ```markdown\n", - "| My Mega fake report | |#1756394 |\n", - "|---------------------|-----------|\n", - "| | 31/05/2024|\n", - "```\n", - "\n", - " improved\n", - "Parsing table
__Plain Text
oe i:a 1eeeeee
mu“ee
Integration CapabilityExcellentGoodGood
Batch ProcessingYesNoYes
seoneeens i
Multilingual Support OCR (Optical CharacterYesa Yesepeeae
Recognition) Price (per user/month)ss $25| ”\" °
Customer Support Rating (out of 5)484.241
Free Trial AvailableYesYesNo
Cloud IntegrationYesNoNo
Security FeaturesAdvancedBasicAdvancedBasicIntermediate
\n", - "Table | | | _ | _ | Plain Text | |\n", - "|-----------------------------|---------|-----------|-----------|------------|---------------|\n", - "| oe i: | a 1 | ee | | ee | ee |\n", - "| | | | mu | “ee | |\n", - "| Integration Capability | Excellent | Good | | | Good |\n", - "| Batch Processing | Yes | No | | | Yes |\n", - "| seoneeens i | | | | | |\n", - "| Multilingual Support OCR (Optical Character Recognition) | Yes | a Yes | epee | ae | |\n", - "| Price (per user/month) | | ss $25 | |” | \" ° | |\n", - "| Customer Support Rating (out of 5) | 48 | 4.2 | | | 41 |\n", - "| Free Trial Available | Yes | Yes | | | No |\n", - "| Cloud Integration | Yes | No | | | No |\n", - "| Security Features | Advanced| Basic | Advanced | Basic | Intermediate |\n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '5503cb79d2097f2f296c666dd4d37fa2', 'text': 'Page 4', 'metadata': {'detection_class_prob': 0.6166406273841858, 'coordinates': {'points': ((200.0, 2063.1318359375), (200.0, 2094.729265570774), (299.2191162109375, 2094.729265570774), (299.2191162109375, 2063.1318359375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 4, 'parent_id': 'a680e88f19f77795f8d682a5d6a7d031', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
My Mega fake report|#1756394
31/05/2024
\n", - "Table | My Mega fake report | #1756394 |\n", - "|---------------------|----------|\n", - "| | 31/05/2024 |\n", - "\n", - " improved\n", - "Parsing table
User Community SizeLargeMediumMediumSmallMedium
“neepefefs
meee “ery
Platform Compatibilitya Windows, Mac, Linux| Windows,WindowsMac, Linux| Windows, Linux
Data Privacy ComplianceiMediumHighMedium
Al-Driven EnhancementsYes
File Size Limit (per document)500MB750MB200MB500MB
Customizable Output TemplatesLimited
Collaboration FeaturesLimited
Document Version ControlYesNoYesNoYes
Import/Export OptionsExtensiveModerateExtensiveLimitedModerate
\n", - "Table | User Community Size | Large | Medium | Medium | Small | Medium |\n", - "|----------------------------|---------------------------|-------------------------|----------|--------------|--------------|\n", - "| “nee | pe | fe | fs | | |\n", - "| meee “ery | | | | | |\n", - "| Platform Compatibility | a Windows, Mac, Linux | \\| Windows | Windows | Mac, Linux | \\| Windows, Linux |\n", - "| Data Privacy Compliance | i | Medium | High | | Medium |\n", - "| Al-Driven Enhancements | | | | | Yes |\n", - "| File Size Limit (per document) | | 500MB | 750MB | 200MB | 500MB |\n", - "| — | | | | | |\n", - "| Customizable Output Templates | | Limited | | | |\n", - "| Collaboration Features | | | | | Limited |\n", - "| Document Version Control | Yes | No | Yes | No | Yes |\n", - "| Import/Export Options | Extensive | Moderate | Extensive| Limited | Moderate |\n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '147ae73148c14eda9f727036e40559a5', 'text': 'Page 5', 'metadata': {'detection_class_prob': 0.6519767642021179, 'coordinates': {'points': ((200.0, 2062.960693359375), (200.0, 2094.729265570774), (298.39019775390625, 2094.729265570774), (298.39019775390625, 2062.960693359375)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 5, 'parent_id': 'a680e88f19f77795f8d682a5d6a7d031', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing table
My Mega fake report|#1756394
31/05/2024
\n", - "Table | My Mega fake report | #1756394 |\n", - "|---------------------|----------|\n", - "| | 31/05/2024 |\n", - "\n", - " improved\n", - "Parsing table
Feedback MechanismYesNoYesNoYes
\n", - "Table | Feedback Mechanism |\n", - "|--------------------| \n", - "| Yes | \n", - "| No | \n", - "| Yes | \n", - "| No | \n", - "| Yes | \n", - "\n", - " improved\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': 'd4c69e80cb05a2aae54ffc46d65e3bc0', 'text': 'Note: All data presented in this table is fictional and for illustrative purposes only.', 'metadata': {'detection_class_prob': 0.8161468505859375, 'coordinates': {'points': ((197.05209350585938, 565.0863647460938), (197.05209350585938, 599.3543835972222), (1385.2050739166666, 599.3543835972222), (1385.2050739166666, 565.0863647460938)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 6, 'parent_id': 'a680e88f19f77795f8d682a5d6a7d031', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing Title\n", - "{'type': 'Title', 'element_id': '62ea19dc1fa74633cf4a56ce35dd4a7c', 'text': 'Conclusion', 'metadata': {'detection_class_prob': 0.7667871117591858, 'coordinates': {'points': ((199.31866455078125, 710.5487670898438), (199.31866455078125, 769.3364868164062), (476.8610534667969, 769.3364868164062), (476.8610534667969, 710.5487670898438)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 6, 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '171cd30e071a7615f885a4a576469480', 'text': 'Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.', 'metadata': {'detection_class_prob': 0.9476631283760071, 'coordinates': {'points': ((194.57557678222656, 799.4579467773438), (194.57557678222656, 1273.9976613749998), (1479.8709716796875, 1273.9976613749998), (1479.8709716796875, 799.4579467773438)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 6, 'parent_id': '62ea19dc1fa74633cf4a56ce35dd4a7c', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n", - "Parsing NarrativeText\n", - "{'type': 'NarrativeText', 'element_id': '56073fb5860946dcb3c676a534ad35db', 'text': 'Page 6', 'metadata': {'detection_class_prob': 0.43293169140815735, 'coordinates': {'points': ((199.91726684570312, 2064.1737093207735), (199.91726684570312, 2094.729265570774), (298.55853271484375, 2094.729265570774), (298.55853271484375, 2064.1737093207735)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 6, 'parent_id': '62ea19dc1fa74633cf4a56ce35dd4a7c', 'file_directory': '../megaparse/tests/input_tests', 'filename': 'MegaFake_report.pdf'}}\n" + "Started parsing the file under job_id 2216e572-99ce-4b37-bdb6-b2d8ff2e18d1\n", + "Started parsing the file under job_id 1c36b5f9-bc30-475f-901d-ea79f9425205\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n", + "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], @@ -184,6 +43,10 @@ "md_content = converter.convert()\n", "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\"))\n", "\n", + "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n", + "md_content = converter.convert(gpt4o_cleaner = True)\n", + "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\"))\n", + "\n", "\n", "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n", "md_content = converter.convert()\n", @@ -199,14 +62,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Started parsing the file under job_id 7c1d3024-8724-488e-87ff-625275e1824d\n" + "Started parsing the file under job_id bed037cd-cdb1-45d1-971b-dc99094650b2\n" ] } ], @@ -249,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -261,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -311,6 +174,7 @@ " return modifications\n", " \n", "diff_megaparse_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_megaparse_llama_gptcleaner = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", "diff_megaparse_llama = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", "diff_llamaparse = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", "diff_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")" @@ -318,15 +182,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Diff megaparse unstructured: 120\n", + "Diff megaparse unstructured: 114\n", "Diff megaparse llama: 26\n", + "Diff megaparse llama gptcleaner: 11\n", "Diff llama parse: 31\n" ] } @@ -334,8 +199,8 @@ "source": [ "print(f\"Diff megaparse unstructured: {diff_megaparse_unstructured}\")\n", "print(f\"Diff megaparse llama: {diff_megaparse_llama}\")\n", - "print(f\"Diff llama parse: {diff_llamaparse}\")\n", - "\n" + "print(f\"Diff megaparse llama gptcleaner: {diff_megaparse_llama_gptcleaner}\")\n", + "print(f\"Diff llama parse: {diff_llamaparse}\")\n" ] } ],