From e498854c3b6c51b6a567acfc4bff096eebba78ca Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 00:37:12 +0300 Subject: [PATCH 1/4] feat: outlook .msg converter --- pyproject.toml | 19 +++------ src/markitdown/_markitdown.py | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..741207d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,7 @@ readme = "README.md" requires-python = ">=3.10" license = "MIT" keywords = [] -authors = [ - { name = "Adam Fourney", email = "adamfo@microsoft.com" }, -] +authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", @@ -35,6 +33,7 @@ dependencies = [ "pdfminer.six", "puremagic", "pydub", + "olefile", "youtube-transcript-api", "SpeechRecognition", "pathvalidate", @@ -54,9 +53,7 @@ path = "src/markitdown/__about__.py" markitdown = "markitdown.__main__:main" [tool.hatch.envs.types] -extra-dependencies = [ - "mypy>=1.0.0", -] +extra-dependencies = ["mypy>=1.0.0"] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:src/markitdown tests}" @@ -64,20 +61,14 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}" source_pkgs = ["markitdown", "tests"] branch = true parallel = true -omit = [ - "src/markitdown/__about__.py", -] +omit = ["src/markitdown/__about__.py"] [tool.coverage.paths] markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] tests = ["tests", "*/markitdown/tests"] [tool.coverage.report] -exclude_lines = [ - "no cov", - "if __name__ == .__main__.:", - "if TYPE_CHECKING:", -] +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] [tool.hatch.build.targets.sdist] only-include = ["src/markitdown"] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..32d5ba2 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -21,6 +21,7 @@ import mammoth import markdownify +import olefile import pandas as pd import pdfminer import pdfminer.high_level @@ -1076,6 +1077,80 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None return response.choices[0].message.content +class OutlookMsgConverter(DocumentConverter): + """Converts Outlook .msg files to markdown by extracting email metadata and content. + + Uses the olefile package to parse the .msg file structure and extract: + - Email headers (From, To, Subject, Date) + - Email body content + - Attachments (listed but not converted) + """ + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if not a MSG file + extension = kwargs.get("file_extension", "") + if extension.lower() != ".msg": + return None + + try: + msg = olefile.OleFileIO(local_path) + # Extract email metadata + md_content = "# Email Message\n\n" + + # Get headers + headers = { + "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"), + "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), + "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), + } + + # Add headers to markdown + for key, value in headers.items(): + if value: + md_content += f"**{key}:** {value}\n" + + md_content += "\n## Content\n\n" + + # Get email body + body = self._get_stream_data(msg, "__substg1.0_1000001F") + if body: + md_content += body + + msg.close() + + return DocumentConverterResult( + title=headers.get("Subject"), text_content=md_content.strip() + ) + + except Exception as e: + raise FileConversionException( + f"Could not convert MSG file '{local_path}': {str(e)}" + ) + + def _get_stream_data( + self, msg: olefile.OleFileIO, stream_path: str + ) -> Union[str, None]: + """Helper to safely extract and decode stream data from the MSG file.""" + try: + if msg.exists(stream_path): + data = msg.openstream(stream_path).read() + # Try UTF-16 first (common for .msg files) + try: + return data.decode("utf-16-le").strip() + except UnicodeDecodeError: + # Fall back to UTF-8 + try: + return data.decode("utf-8").strip() + except UnicodeDecodeError: + # Last resort - ignore errors + return data.decode("utf-8", errors="ignore").strip() + except Exception: + pass + return None + + class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. @@ -1285,6 +1360,7 @@ def __init__( self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) + self.register_page_converter(OutlookMsgConverter()) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any From 997f244d8f11da3f5aa5f21e644f8607078aba70 Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 01:01:04 +0300 Subject: [PATCH 2/4] correct formatting --- pyproject.toml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 741207d..67f6825 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,9 @@ readme = "README.md" requires-python = ">=3.10" license = "MIT" keywords = [] -authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }] +authors = [ + { name = "Adam Fourney", email = "adamfo@microsoft.com" }, +] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", @@ -53,7 +55,9 @@ path = "src/markitdown/__about__.py" markitdown = "markitdown.__main__:main" [tool.hatch.envs.types] -extra-dependencies = ["mypy>=1.0.0"] +extra-dependencies = [ + "mypy>=1.0.0", +] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:src/markitdown tests}" @@ -61,14 +65,20 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}" source_pkgs = ["markitdown", "tests"] branch = true parallel = true -omit = ["src/markitdown/__about__.py"] +omit = [ + "src/markitdown/__about__.py", +] [tool.coverage.paths] markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] tests = ["tests", "*/markitdown/tests"] [tool.coverage.report] -exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] [tool.hatch.build.targets.sdist] only-include = ["src/markitdown"] From 3207e3ab380c21f63d9fd970de41c348c65a5b7a Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:30:53 +0300 Subject: [PATCH 3/4] add test, adjust docstring --- src/markitdown/_markitdown.py | 3 +-- tests/test_files/test_outlook_msg.msg | Bin 0 -> 13312 bytes tests/test_markitdown.py | 13 +++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 tests/test_files/test_outlook_msg.msg diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 32d5ba2..d72196b 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1083,7 +1083,6 @@ class OutlookMsgConverter(DocumentConverter): Uses the olefile package to parse the .msg file structure and extract: - Email headers (From, To, Subject, Date) - Email body content - - Attachments (listed but not converted) """ def convert( @@ -1101,7 +1100,7 @@ def convert( # Get headers headers = { - "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"), + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), } diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg new file mode 100644 index 0000000000000000000000000000000000000000..05b087b77c785c8b57a479485b9715fb7dfecbb4 GIT binary patch literal 13312 zcmeHN-BVk~6+c3bjg7ILwn=KIj;=}Trgp#*2rveM2nZVkG6mtd9nCNjlCWAKQAxyM zJj|q-$xPdqzW6l{{rCs^(npW~gZ8mAopjpA=DE{8CG_{(s}+Li<6J|AWX6m4-g|b> z?(dwl=j@mK*T1~{&)@y&(!b?y~s1$oPDb zzCjk~blees#sL(WabNo9xsPR^kLX*voQG{cD~qxqeG-$RR3zgSUgBs|MoUMcvLQ*y zNgm$|rnC%ty-lCX3-QHU3oA>L@u|sJ-`vVld}V%RIdXepa(2FN>fS;-fhfBpQ|37* zTT+57TaaN(R(+0)K_-?ZQM!g_0enB-$5oaHWVDj^fvX7Wop!Lb`seGv)*P0aMUG0Z z+=rz~uw@Ps6yz4P3PIYSbYq^FHX3A1=`!Rm$lIvz$Df0`45TI%L=KyFA#nD~0G>ho zIdUD(0rSn?_K!|4B$zfmkHLq_e5c+?J??|1!8`+*RQp=S5%;;7z z(rG-6EoWdwjv*~LE)&3wqpTM?OhD=hP|J{f9a3$Z_cAPVAKz(6ejggAVa@j>3(j%$ z@cbS`S>lky%CQ9>%pQ{*Q^uzl-vvm~3%Rz<2vIhAa2};Olq603+`yY9^v8nnyd1nC zBctF>p(pxZ+VF0}Mm@%_=w}E2Mo^N1b=OSI5Ik}O?S}CjLfIf%(nidOOTWAx_~X+* zT)6m2K|b}Jk9S~I{gBdgUh2b>TBnUR5j;PDDkPm@#u2AZ#f#9jxI%_4<;8D=cAgrll|}MtwH-T&ta{#*S>`DSTkOj!x#Ou`DpFQ z5eM&K)}J_Lq#Sy1s?Q4OOx4fRh!O|>p2gFaTX`LD*;RuvL@Dfg=f$Hx_K7erMCT`| z)#F5|k_{!2g>ue3);8Be{e%69Y^k{Mu(FX4If-m4m(G=)L^_kr1|KJ~#X_Z$%Adhk zFp(`k%9YE`WGR=-kM8WCaIKgxmQbH4WzxaKdMT4B1QV(JRwg*Q(|snfwH8WjCQIqW zWmx(XSUUHL}xnBI1ZS@T`Vlq*hVBbm!P8<}!Bxt^)(Yq-DG7iF#yL8tr7 zJ~2bVurqc8g4T&w$6c=x6h&9g&d#1odHVXgteKvKu47&=e)4-m-hKMFWY4Fabyc1| z{Z(%JwvGMrn%UpN#c?7M3CJZwf2+nnRNMc$^}~j~TCI;@wJ)^Tye4@Tqj_2R{i4~uY6VlUX_Jv4rG{xL_0rRdb` z;%qb)_j-z;&$|9=*R5vr&lAYZ`NGaWd-IJu|JZf0+5BVc?fmm*wLjY8I1=w}8)Z(2 zww!wf-us?K?nXX`d>)x==F7<33w;~;JIJpfv)yZW^FG-2(dg||>wEB~9lnqJ1LUj7 zKSXAmAK`ti`c1kQ-+jn8khy0GA@?H>Acv6$ksD!+qI|4L-Mc8Cs;is9_w72mzCT{* zaW!QbUf=k%g!3V9PI>>yGiwQFMlb1AdC3P;n8euW>UJaIncE3X*)Tn$!yv8Vk9RaY zf#z__v1)E=_U;kgJ4Bba(eo|w8GOv+wLNqrzGh{_FPpnMp2F9wcb2QwyCB|4hLGuD zjng&kRH!`W1M}NQW3DfkGIp3+Pha6&%hb&N z#2vs{RBM0kqqX?-l`8~CYK=dw)}L#X7GL9k)IV(ePdENtAGP?Ls~#XTwbFkGtC@EE zxi)L@=___0(h7eVzk_J;xi8S-(^pP12#}iNPa*Dj1FxI^xF^x#zY5-u2#(YmUmyS6 z4{7l)g7h%EKrR03;B7zua{Oo1_}qhP@o9VR zA(@)%Pu;ox=;EK-87;nU{vp2mo2O>|;oke`@wxu%^yj``i%;LRACU+>D0i@zARKZgXX`+v@9T72GNF1p7g zqyOFwQDf9qoQ|z8+Bdo|=~ngp^Cl>}rc|p`uV(fSV(y*6ENDB_>GGr7ICe)7kX`uWd${PWh!O*Z~Hr`a{l)@k0& z#@}oF^WILY|2TM$I{vSEz~}#tJL>r7eWh0auOj}rrfTsgz_knM-~6Pm)F{V8oWmx|2)rW@uT2B_4^;s;ye$t&F2*A JMf%!W;Qy?8Su_9u literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..a0626d1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -63,6 +63,15 @@ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", ] +MSG_TEST_STRINGS = [ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", +] + DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -232,6 +241,10 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test MSG (Outlook email) processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) + validate_strings(result, MSG_TEST_STRINGS) + @pytest.mark.skipif( skip_exiftool, From a5d67878f2972f0b2206165e72f4eea13f7f7863 Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 13:25:05 +0300 Subject: [PATCH 4/4] change misleading docstring --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d72196b..5bb9c67 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1081,7 +1081,7 @@ class OutlookMsgConverter(DocumentConverter): """Converts Outlook .msg files to markdown by extracting email metadata and content. Uses the olefile package to parse the .msg file structure and extract: - - Email headers (From, To, Subject, Date) + - Email headers (From, To, Subject) - Email body content """