Skip to content

Commit

Permalink
add options to keep data uris
Browse files Browse the repository at this point in the history
  • Loading branch information
VoidIsVoid authored Jan 9, 2025
1 parent f58a864 commit 42fb33a
Showing 1 changed file with 44 additions and 15 deletions.
59 changes: 44 additions & 15 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

def __init__(self, **options: Any):
    """Set up the Markdown converter.

    Recognized keyword options (both optional):
      * ``heading_style``  -- markdownify heading style; defaults to ATX.
      * ``keep_data_uris`` -- when truthy, image ``data:`` URIs are kept
        verbatim in the output; defaults to ``False``.
    """
    # Consume our custom flag first so it is not forwarded to markdownify,
    # which does not know about it.
    self.keep_data_uris = options.pop("keep_data_uris", False)
    # Default to ATX-style ("#") headings unless the caller chose otherwise.
    options["heading_style"] = options.get("heading_style", markdownify.ATX)
    super().__init__(**options)

Expand Down Expand Up @@ -133,10 +134,10 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
return alt

# Remove dataURIs
if src.startswith("data:"):
if not self.keep_data_uris and src.startswith("data:"):
src = src.split(",")[0] + "..."

return "![%s](%s%s)" % (alt, src, title_part)
return "![%s%s](%s)" % (alt, title_part, src)

def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore
Expand Down Expand Up @@ -189,6 +190,10 @@ def convert(
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""

def __init__(self, keep_data_uris: Optional[bool] = False):
    """Create an HTML-to-Markdown converter.

    :param keep_data_uris: when True, image ``data:`` URIs are preserved
        in the Markdown output instead of being truncated.
    """
    super().__init__()
    self.keep_data_uris = keep_data_uris

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -217,9 +222,13 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)

assert isinstance(webpage_text, str)

Expand All @@ -232,6 +241,10 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
class RSSConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""

def __init__(self, keep_data_uris: Optional[bool] = False):
    """Create an RSS/Atom feed converter.

    :param keep_data_uris: forwarded to the Markdown converter so that
        image ``data:`` URIs survive conversion when True.
    """
    super().__init__()
    self.keep_data_uris = keep_data_uris

def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -347,7 +360,9 @@ def _parse_content(self, content: str) -> str:
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
return _CustomMarkdownify(keep_data_uris=self.keep_data_uris).convert_soup(
soup
)
except BaseException as _:
return content

Expand All @@ -369,6 +384,10 @@ def _get_data_by_tag_name(
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""

def __init__(self, keep_data_uris: Optional[bool] = False):
    """Create a Wikipedia page converter.

    :param keep_data_uris: forwarded to the Markdown converter so that
        image ``data:`` URIs survive conversion when True.
    """
    super().__init__()
    self.keep_data_uris = keep_data_uris

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -403,11 +422,13 @@ def convert(
assert isinstance(main_title, str)

# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)

return DocumentConverterResult(
title=main_title,
Expand Down Expand Up @@ -609,6 +630,10 @@ def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResul


class BingSerpConverter(DocumentConverter):
def __init__(self, keep_data_uris: Optional[bool] = False):
    """Create a Bing search-results-page converter.

    :param keep_data_uris: forwarded to the Markdown converter so that
        image ``data:`` URIs survive conversion when True.
    """
    # NOTE(review): this __init__ was inserted *above* the class-level
    # triple-quoted string, so that string is no longer the first statement
    # of the class body and no longer becomes BingSerpConverter.__doc__.
    # The string should be moved back above __init__ to restore the class
    # docstring.
    super().__init__()
    self.keep_data_uris = keep_data_uris

"""
Handle Bing results pages (only the organic search results).
NOTE: It is better to use the Bing API
Expand Down Expand Up @@ -640,7 +665,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
slug.extract()

# Parse the algorithmic results
_markdownify = _CustomMarkdownify()
_markdownify = _CustomMarkdownify(keep_data_uris=self.keep_data_uris)
results = list()
for result in soup.find_all(class_="b_algo"):
# Rewrite redirect urls
Expand Down Expand Up @@ -701,6 +726,9 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(self, keep_data_uris: Optional[bool] = False):
    """Create a DOCX converter.

    :param keep_data_uris: passed straight through to ``HtmlConverter``,
        which handles the Markdown conversion of the extracted HTML.
    """
    # Pure pass-through: DOCX conversion reuses the HTML pipeline.
    super().__init__(keep_data_uris=keep_data_uris)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
Expand Down Expand Up @@ -1337,6 +1365,7 @@ def __init__(
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
keep_data_uris: Optional[bool] = False,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
Expand Down Expand Up @@ -1389,12 +1418,12 @@ def __init__(
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter())
self.register_page_converter(HtmlConverter())
self.register_page_converter(RSSConverter())
self.register_page_converter(WikipediaConverter())
self.register_page_converter(HtmlConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(RSSConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(WikipediaConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter())
self.register_page_converter(BingSerpConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(DocxConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(XlsxConverter())
self.register_page_converter(XlsConverter())
self.register_page_converter(PptxConverter())
Expand Down

0 comments on commit 42fb33a

Please sign in to comment.