Skip to content

Commit

Permalink
add: gpt cleaner for header and footer (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia authored Jun 3, 2024
1 parent d966abb commit 20cc562
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 180 deletions.
8 changes: 4 additions & 4 deletions megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def _unstructured_parse(self, file_path: str):
unstructured_parser = UnstructuredParser()
return unstructured_parser.convert(file_path)

def convert(self, file_path: str) -> str:
def convert(self, file_path: str, gpt4o_cleaner = False) -> str:
parsed_md = ""
if self.llama_parse_api_key:
parsed_md = self._llama_parse(self.llama_parse_api_key, file_path)
Expand All @@ -248,7 +248,7 @@ def convert(self, file_path: str) -> str:
strict=self.handle_header,
remove_pagination=self.handle_pagination,
)
md_cleaned = md_processor.process()
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
return md_cleaned

def save_md(self, md_content: str, file_path: Path | str) -> None:
Expand All @@ -261,7 +261,7 @@ def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> No
self.file_path = file_path
self.llama_parse_api_key = llama_parse_api_key

def convert(self) -> str:
def convert(self, **kwargs) -> str:
file_extension: str = os.path.splitext(self.file_path)[1]

if file_extension == ".docx":
Expand All @@ -277,7 +277,7 @@ def convert(self) -> str:
else:
print(self.file_path, file_extension)
raise ValueError(f"Unsupported file extension: {file_extension}")
return converter.convert(self.file_path)
return converter.convert(self.file_path, **kwargs)

def save_md(self, md_content: str, file_path: Path | str) -> None:
os.makedirs(os.path.dirname(file_path), exist_ok=True)
Expand Down
59 changes: 45 additions & 14 deletions megaparse/markdown_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from collections import Counter
from typing import LiteralString
from langchain_openai import ChatOpenAI


class MarkdownProcessor:
def __init__(self, md_result: str, strict: bool, remove_pagination: bool):
Expand Down Expand Up @@ -67,25 +69,54 @@ def save_cleaned_result(self, cleaned_result: str, output_path: str):
with open(output_path, "w") as f:
f.write(cleaned_result)

def process(self):
def remove_header_llm(self):
    """Remove repetitive headers/footers from the parsed markdown via GPT-4o.

    Sends the whole parsed document (``self.md_result``) to the model with a
    system instruction describing the cleaning task, and returns the model's
    cleaned markdown.

    Returns:
        The LLM response content — a ``str`` in the common case, though
        langchain's ``BaseMessage.content`` may also be a list of content
        parts (callers should coerce with ``str()`` if needed).
    """
    llm = ChatOpenAI(model="gpt-4o")

    # (role, content) tuples are langchain's shorthand message format.
    # NOTE: the original version also built a large f-string prompt that was
    # never passed to the model — that dead code has been removed; the system
    # message below is the instruction the model actually receives.
    messages = [
        (
            "system",
            "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.",
        ),
        ("human", self.md_result),  # type: ignore
    ]

    result = llm.invoke(messages)
    return result.content



def process(self, gpt4o_cleaner=False):
    """Process the markdown result by removing duplicate paragraphs and headers.

    Args:
        gpt4o_cleaner: when True, delegate header/footer removal to the
            GPT-4o based cleaner (``remove_header_llm``) instead of running
            the heuristic duplicate-detection pipeline.

    Returns:
        The cleaned markdown string, with split tables merged back together.
    """
    if gpt4o_cleaner:
        cleaned_result = self.remove_header_llm()
    else:
        pages = self.split_into_pages()
        paragraphs = self.split_into_paragraphs(pages)

        cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates(paragraphs)
        header_components_count = self.identify_header_components(duplicate_paragraphs)

        if self.strict:
            # Leave the first 5 paragraphs untouched (presumably the title
            # page / front matter — TODO confirm) and strip header-like
            # lines from the remainder only.
            final_paragraphs = self.remove_header_lines(
                cleaned_paragraphs[5:], header_components_count
            )
            final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs
        else:
            final_paragraphs = cleaned_paragraphs

        cleaned_result = "\n\n".join(final_paragraphs)

    # str() guards against the LLM path returning a non-str content payload
    # (langchain message content may be a list of parts).
    cleaned_result = self.merge_tables(str(cleaned_result))
    return cleaned_result

Loading

0 comments on commit 20cc562

Please sign in to comment.