Skip to content

Commit

Permalink
add: gpt cleaner for header and footer (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia authored Jun 3, 2024
1 parent d966abb commit 20cc562
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 180 deletions.
8 changes: 4 additions & 4 deletions megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def _unstructured_parse(self, file_path: str):
unstructured_parser = UnstructuredParser()
return unstructured_parser.convert(file_path)

def convert(self, file_path: str) -> str:
def convert(self, file_path: str, gpt4o_cleaner = False) -> str:
parsed_md = ""
if self.llama_parse_api_key:
parsed_md = self._llama_parse(self.llama_parse_api_key, file_path)
Expand All @@ -248,7 +248,7 @@ def convert(self, file_path: str) -> str:
strict=self.handle_header,
remove_pagination=self.handle_pagination,
)
md_cleaned = md_processor.process()
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
return md_cleaned

def save_md(self, md_content: str, file_path: Path | str) -> None:
Expand All @@ -261,7 +261,7 @@ def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> No
self.file_path = file_path
self.llama_parse_api_key = llama_parse_api_key

def convert(self) -> str:
def convert(self, **kwargs) -> str:
file_extension: str = os.path.splitext(self.file_path)[1]

if file_extension == ".docx":
Expand All @@ -277,7 +277,7 @@ def convert(self) -> str:
else:
print(self.file_path, file_extension)
raise ValueError(f"Unsupported file extension: {file_extension}")
return converter.convert(self.file_path)
return converter.convert(self.file_path, **kwargs)

def save_md(self, md_content: str, file_path: Path | str) -> None:
os.makedirs(os.path.dirname(file_path), exist_ok=True)
Expand Down
59 changes: 45 additions & 14 deletions megaparse/markdown_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from collections import Counter
from typing import LiteralString
from langchain_openai import ChatOpenAI


class MarkdownProcessor:
def __init__(self, md_result: str, strict: bool, remove_pagination: bool):
Expand Down Expand Up @@ -67,25 +69,54 @@ def save_cleaned_result(self, cleaned_result: str, output_path: str):
with open(output_path, "w") as f:
f.write(cleaned_result)

def process(self):
def remove_header_llm(self):
    """Remove repetitive headers/footers from the parsed markdown via GPT-4o.

    Sends the whole parsed document (``self.md_result``) to the model with a
    system instruction describing the cleaning task, and returns the model's
    cleaned markdown.

    Returns:
        The LLM response content — a ``str`` in the common case, though
        langchain's ``BaseMessage.content`` may also be a list of content
        parts (callers should coerce with ``str()`` if needed).
    """
    llm = ChatOpenAI(model="gpt-4o")

    # (role, content) tuples are langchain's shorthand message format.
    # NOTE: the original version also built a large f-string prompt that was
    # never passed to the model — that dead code has been removed; the system
    # message below is the instruction the model actually receives.
    messages = [
        (
            "system",
            "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.",
        ),
        ("human", self.md_result),  # type: ignore
    ]

    result = llm.invoke(messages)
    return result.content



def process(self, gpt4o_cleaner=False):
    """Process the markdown result by removing duplicate paragraphs and headers.

    Args:
        gpt4o_cleaner: when True, delegate header/footer removal to the
            GPT-4o based cleaner (``remove_header_llm``) instead of running
            the heuristic duplicate-detection pipeline.

    Returns:
        The cleaned markdown string, with split tables merged back together.
    """
    if gpt4o_cleaner:
        cleaned_result = self.remove_header_llm()
    else:
        pages = self.split_into_pages()
        paragraphs = self.split_into_paragraphs(pages)

        cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates(paragraphs)
        header_components_count = self.identify_header_components(duplicate_paragraphs)

        if self.strict:
            # Leave the first 5 paragraphs untouched (presumably the title
            # page / front matter — TODO confirm) and strip header-like
            # lines from the remainder only.
            final_paragraphs = self.remove_header_lines(
                cleaned_paragraphs[5:], header_components_count
            )
            final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs
        else:
            final_paragraphs = cleaned_paragraphs

        cleaned_result = "\n\n".join(final_paragraphs)

    # str() guards against the LLM path returning a non-str content payload
    # (langchain message content may be a list of parts).
    cleaned_result = self.merge_tables(str(cleaned_result))
    return cleaned_result

Loading

0 comments on commit 20cc562

Please sign in to comment.