From 68cc8aa67206c65886bce145bd87ebe5a4cfe635 Mon Sep 17 00:00:00 2001
From: Raduan77
Date: Thu, 9 Jan 2025 11:14:50 +0100
Subject: [PATCH] add support for EML

---
Reviewer revision of the original commit: the EML file is parsed from bytes,
attachment parts are kept out of the body, attachment sizes are decoded byte
counts, and every From/To address is rendered. The index hash recorded for
_markitdown.py predates this revision.

 src/markitdown/_markitdown.py | 100 ++++++++++++++++++++++++++++++++++
 tests/test_files/test.eml     |  33 +++++++++++
 tests/test_markitdown.py      |  21 +++++++
 3 files changed, 154 insertions(+)
 create mode 100644 tests/test_files/test.eml

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 2e7e5ffd..63b41309 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -17,6 +17,9 @@
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
+from email import policy
+from email.parser import BytesParser
+from email.utils import getaddresses
 
 import mammoth
 import markdownify
@@ -1075,6 +1078,102 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         return response.choices[0].message.content
 
 
+class EmlConverter(DocumentConverter):
+    """Converts EML (email) files to Markdown.
+
+    The output lists the main headers, the text body, and any attachments.
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert an EML file to Markdown.
+
+        Args:
+            local_path: Path to the EML file.
+            **kwargs: Only ``file_extension`` is read, to recognise EML input.
+
+        Returns:
+            The converted document, or None when the file is not an EML file.
+        """
+        # Bail out unless this is an EML file
+        file_ext = (kwargs.get("file_extension") or "").lower()
+        if not file_ext.endswith(".eml"):
+            return None
+
+        # Parse from bytes: MIME parts can declare their own charset, so decoding
+        # the whole file as UTF-8 up front breaks on 8-bit (e.g. Latin-1) messages.
+        with open(local_path, "rb") as fp:
+            msg = BytesParser(policy=policy.default).parse(fp)
+
+        # Use the email subject as the document title
+        result = DocumentConverterResult(
+            title=str(msg.get("subject", "Untitled Email"))
+        )
+
+        md_parts = ["## Email Headers\n"]
+        md_parts.append(f"**From:** {self._format_addresses(msg.get('from', ''))}")
+        md_parts.append(f"**To:** {self._format_addresses(msg.get('to', ''))}")
+        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
+        md_parts.append(f"**Date:** {msg.get('date', '')}")
+
+        # Add CC if present
+        if msg.get("cc"):
+            md_parts.append(f"**CC:** {msg.get('cc')}")
+
+        md_parts.append("\n## Email Content\n")
+
+        if msg.is_multipart():
+            # Attachments are listed separately below, so keep them out of the body
+            body_parts = [
+                part
+                for part in msg.walk()
+                if part.get_content_disposition() != "attachment"
+            ]
+            # Prefer plain text; fall back to HTML only when there is no plain text
+            has_plain = any(p.get_content_type() == "text/plain" for p in body_parts)
+            wanted_type = "text/plain" if has_plain else "text/html"
+            md_parts.extend(
+                part.get_content()
+                for part in body_parts
+                if part.get_content_type() == wanted_type
+            )
+        elif msg.get_content_maintype() == "text":
+            # A non-text single-part body yields bytes, which cannot be joined below
+            md_parts.append(msg.get_content())
+
+        # List attachments if any
+        attachments = []
+        for part in msg.walk():
+            filename = part.get_filename()
+            if part.get_content_disposition() != "attachment" or not filename:
+                continue
+            # Size of the decoded payload; container parts decode to None
+            payload = part.get_payload(decode=True) or b""
+            attachments.append(
+                f"- {filename} ({part.get_content_type()}, {len(payload):,} bytes)"
+            )
+
+        if attachments:
+            md_parts.append("\n## Attachments\n")
+            md_parts.extend(attachments)
+
+        result.text_content = "\n".join(md_parts)
+
+        return result
+
+    @staticmethod
+    def _format_addresses(header_value: Any) -> str:
+        """Render an address header as comma-separated ``Name <address>`` entries."""
+        formatted = []
+        for name, address in getaddresses([str(header_value)]):
+            if name:
+                formatted.append(f"{name} <{address}>")
+            elif address:
+                formatted.append(address)
+        return ", ".join(formatted)
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1273,6 +1372,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(EmlConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any
diff --git a/tests/test_files/test.eml b/tests/test_files/test.eml
new file mode 100644
index 00000000..15f6b85c
--- /dev/null
+++ b/tests/test_files/test.eml
@@ -0,0 +1,33 @@
+Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
+MIME-Version: 1.0
+Subject: Test Email Document
+From: John Doe <john.doe@example.com>
+To: Jane Smith <jane.smith@example.com>
+Date: Wed, 18 Dec 2024 10:00:00 +0000
+CC: cc.person@example.com
+
+--===============8484938434343225034==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+
+This is a test email with multiple parts.
+
+It contains:
+- Plain text content
+- An attachment
+- Various headers
+
+Best regards,
+John Doe
+
+--===============8484938434343225034==
+Content-Type: application/txt
+MIME-Version: 1.0
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="test.txt"
+
+VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
+
+--===============8484938434343225034==--
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 316e670e..7a7be55b 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -126,6 +126,20 @@
     "髙橋淳,35,名古屋",
 ]
 
+EML_TEST_STRINGS = [
+    "## Email Headers",
+    "**From:** John Doe <john.doe@example.com>",
+    "**To:** Jane Smith <jane.smith@example.com>",
+    "**Subject:** Test Email Document",
+    "**CC:** cc.person@example.com",
+    "## Email Content",
+    "This is a test email with multiple parts",
+    "- Plain text content",
+    "- An attachment",
+    "## Attachments",
+    "- test.txt (application/txt, 31 bytes)",
+]
+
 LLM_TEST_STRINGS = [
     "5bda1dd6",
 ]
@@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
         text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
 
+    # Test EML processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
+    assert result.title == "Test Email Document"
+    for test_string in EML_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test HTML processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL