microsoft · FeuRicardo · Dec 19, 2024 · Dec 19, 2024 · Dec 19, 2024 · Dec 19, 2024
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -1,4 +1,5 @@
 # type: ignore
+from io import BytesIO
 import base64
 import binascii
 import copy
@@ -66,12 +67,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
     - Altering the default heading style to use '#', '##', etc.
     - Removing javascript hyperlinks.
-    - Truncating images with large data:uri sources.
+    - Using mlm for transcription the images, otherwise, truncation images with large data:uri sources.
     - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
     """
 
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
+
+        self.mlm_client = options.get("mlm_client")
+        self.mlm_model = options.get("mlm_model")
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
@@ -124,7 +128,8 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
 
         alt = el.attrs.get("alt", None) or ""
         src = el.attrs.get("src", None) or ""
-        title = el.attrs.get("title", None) or ""
+        title = el.attrs.get("title", None) or ""      
+
         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
         if (
             convert_as_inline
@@ -133,8 +138,13 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
             return alt
 
         # Remove dataURIs
-        if src.startswith("data:"):
-            src = src.split(",")[0] + "..."
+        if src.startswith("data:image/"):
+            if self.mlm_client is not None and self.mlm_model is not None:
+                md = ImageConverter()
+                result = md._convert(src, mlm_client=self.mlm_client, mlm_model=self.mlm_model)
+                src = result.text_content if result is not None else src.split(",")[0] + "..."              
+            else:
+                src = src.split(",")[0] + "..."
 
         return "![%s](%s%s)" % (alt, src, title_part)
 
@@ -199,11 +209,11 @@ def convert(
 
         result = None
         with open(local_path, "rt", encoding="utf-8") as fh:
-            result = self._convert(fh.read())
+            result = self._convert(fh.read(), **kwargs)
 
         return result
 
-    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+    def _convert(self, html_content: str, **kwargs) -> Union[None, DocumentConverterResult]:
         """Helper function that converts and HTML string."""
 
         # Parse the string
@@ -216,10 +226,14 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
+
+        # add mlm_client and mlm_model to the options
+        #options = copy.deepcopy(kwargs)
+
         if body_elm:
-            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify().convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
 
         assert isinstance(webpage_text, str)
 
@@ -713,7 +727,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
 
             result = mammoth.convert_to_html(docx_file, style_map=style_map)
             html_content = result.value
-            result = self._convert(html_content)
+            result = self._convert(html_content, **kwargs)
 
         return result
 
@@ -778,7 +792,9 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
             return None
 
         md_content = ""
-
+        self._mlm_client = kwargs.get("mlm_client")
+        self._mlm_model = kwargs.get("mlm_model")     
+
         presentation = pptx.Presentation(local_path)
         slide_num = 0
         for slide in presentation.slides:
@@ -795,8 +811,8 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                     try:
                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                     except Exception:
-                        pass
-
+                        pass                  
+                                            
                     # A placeholder name
                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
                     md_content += (
@@ -806,6 +822,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                         + filename
                         + ")\n"
                     )
+                    md_content += self._convert_image_to_markdown(shape)
 
                 # Tables
                 if self._is_table(shape):
@@ -850,6 +867,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
             text_content=md_content.strip(),
         )
 
+    def _convert_image_to_markdown(self, shape) -> str:   
+        if not self._is_picture(shape):
+            return ""
+
+        image_converter = ImageConverter() if (self._mlm_client is not None) and (self._mlm_model is not None) else None
+
+        if image_converter is not None:            
+            image = shape.image
+            content_type = image.content_type  
+            blob = image.blob
+
+            try:
+                ext = f"data:{content_type};base64"
+                image_base64_uri = f"{ext},{base64.b64encode(blob).decode('utf-8')}"
+                image_description = image_converter._convert(image_base64_uri, mlm_client=self._mlm_client, mlm_model=self._mlm_model)
+
+                return ("\n" + image_description.text_content.strip() + "\n")
+            except Exception as e:
+                print("Error converting image to markdown")
+                sys.stderr.write(f"Error converting image to markdown: {e}")                
+
+        return ""
+
     def _is_picture(self, shape):
         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
             return True
@@ -1037,7 +1077,37 @@ class ImageConverter(MediaConverter):
     """
     Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
+    def _convert(self, data_base64_uri, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not an image
+        try:
+            content_type = data_base64_uri.split(",")[0].split(";")[0]
+            if content_type.lower() not in ["data:image/jpg", "data:image/jpeg", "data:image/png"]:
+                return None
+        except Exception:
+            return None
+
+        # Try describing the image with GPTV
+        mlm_client = kwargs.get("mlm_client")
+        mlm_model = kwargs.get("mlm_model")
+        md_content = ""
+
+        if mlm_client is not None and mlm_model is not None:            
+            md_content = (
+                "\n# Image Description:\n"
+                + self._get_mlm_description(
+                    data_base64_uri,
+                    mlm_client,
+                    mlm_model,
+                    prompt=kwargs.get("mlm_prompt"),
+                ).strip()
+                + "\n"
+            )
 
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content,
+        )
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not an image
         extension = kwargs.get("file_extension", "")
@@ -1064,39 +1134,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                 if f in metadata:
                     md_content += f"{f}: {metadata[f]}\n"
 
-        # Try describing the image with GPTV
-        llm_client = kwargs.get("llm_client")
-        llm_model = kwargs.get("llm_model")
-        if llm_client is not None and llm_model is not None:
-            md_content += (
-                "\n# Description:\n"
-                + self._get_llm_description(
-                    local_path,
-                    extension,
-                    llm_client,
-                    llm_model,
-                    prompt=kwargs.get("llm_prompt"),
-                ).strip()
-                + "\n"
-            )
+        image_base64_uri = self._get_image_base64(local_path, extension)
+        md_content += self._convert(image_base64_uri, **kwargs).text_content        
 
         return DocumentConverterResult(
             title=None,
             text_content=md_content,
         )
-
-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
-        if prompt is None or prompt.strip() == "":
-            prompt = "Write a detailed caption for this image."
-
-        data_uri = ""
+
+    def _get_image_base64(self, local_path, extension):
         with open(local_path, "rb") as image_file:
             content_type, encoding = mimetypes.guess_type("_dummy" + extension)
             if content_type is None:
                 content_type = "image/jpeg"
             image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
+
+            return f"data:{content_type};base64,{image_base64}"  
+
+    def _get_mlm_description(self, data_base64_uri, client, model, prompt=None):
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
 
+        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+
         messages = [
             {
                 "role": "user",
@@ -1105,7 +1165,7 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": data_uri,
+                            "url": data_base64_uri,
                         },
                     },
                 ],
@@ -1115,7 +1175,6 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         response = client.chat.completions.create(model=model, messages=messages)
         return response.choices[0].message.content
 
-
 class OutlookMsgConverter(DocumentConverter):
     """Converts Outlook .msg files to markdown by extracting email metadata and content.
 
@@ -1477,6 +1536,9 @@ def convert_stream(
 
             # Convert
             result = self._convert(temp_path, extensions, **kwargs)
+        except Exception as e:
+            sys.stderr.write(f"Error converting stream to markdown: {e}")
+            pass
         # Clean up
         finally:
             try:
@@ -1548,22 +1610,22 @@ def _convert(
     ) -> DocumentConverterResult:
         error_trace = ""
         for ext in extensions + [None]:  # Try last with no extension
-            for converter in self._page_converters:
-                _kwargs = copy.deepcopy(kwargs)
-
-                # Overwrite file_extension appropriately
-                if ext is None:
-                    if "file_extension" in _kwargs:
-                        del _kwargs["file_extension"]
-                else:
-                    _kwargs.update({"file_extension": ext})
+            _kwargs = copy.deepcopy(kwargs)
+            # Overwrite file_extension appropriately
+            if ext is None:
+                if "file_extension" in _kwargs:
+                    del _kwargs["file_extension"]
+            else:
+                _kwargs.update({"file_extension": ext})
 
-                # Copy any additional global options
-                if "llm_client" not in _kwargs and self._llm_client is not None:
-                    _kwargs["llm_client"] = self._llm_client
+            # Copy any additional global options
+            if "mlm_client" not in _kwargs and self._llm_client is not None:
+                _kwargs["mlm_client"] = self._llm_client
 
-                if "llm_model" not in _kwargs and self._llm_model is not None:
-                    _kwargs["llm_model"] = self._llm_model
+            if "mlm_model" not in _kwargs and self._llm_model is not None:
+                _kwargs["mlm_model"] = self._llm_model      
+
+            for converter in self._page_converters:
 
                 if "style_map" not in _kwargs and self._style_map is not None:
                     _kwargs["style_map"] = self._style_map

diff --git a/tests/test_files/test.docx b/tests/test_files/test.docx
diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3 -m pytest
 import io
 import os
+from dotenv import load_dotenv
 import shutil
-
+from openai import OpenAI, AzureOpenAI
 import pytest
 import requests
 
@@ -134,6 +135,7 @@
     "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 
+
 CSV_CP932_TEST_STRINGS = [
     "名前,年齢,住所",
     "佐藤太郎,30,東京",
@@ -189,8 +191,20 @@ def test_markitdown_remote() -> None:
     #     assert test_string in result.text_content
 
 
-def test_markitdown_local() -> None:
-    markitdown = MarkItDown()
+def test_markitdown_local(use_mlm = False) -> None:
+    if (use_mlm):
+        load_dotenv()   
+        client = AzureOpenAI(
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
+            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
+        )
+        llm_model="gpt-4oModel"
+
+        markitdown = MarkItDown(llm_client=client, llm_model=llm_model)
+    else:
+        markitdown = MarkItDown()
+
 
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
@@ -305,7 +319,6 @@ def test_markitdown_exiftool() -> None:
         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
         assert target in result.text_content
 
-
 def test_markitdown_deprecation() -> None:
     try:
         with catch_warnings(record=True) as w:
@@ -361,6 +374,8 @@ def test_markitdown_llm() -> None:
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
+    test_markitdown_remote()
+    test_markitdown_local(True)
     # test_markitdown_remote()
     # test_markitdown_local()
     test_markitdown_exiftool()