diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..9ca5d67 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -93,9 +93,15 @@ def convert_a(self, el: Any, text: str, convert_as_inline: bool): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in [ + "http", + "https", + "file", + ]: # type: ignore return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse( + parsed_url._replace(path=quote(unquote(parsed_url.path))) + ) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -718,7 +724,20 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def _clean_colname(self, colname: Any) -> Any: + # Remove Pandas header placeholders + if isinstance(colname, str) and colname.startswith("Unnamed:"): + return None + return colname + + def convert( + self, + local_path, + na_rep: Any = "", + drop_empty_cols: bool = False, + drop_empty_rows: bool = False, + **kwargs, + ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") if extension.lower() != ".xlsx": @@ -726,9 +745,22 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: sheets = pd.read_excel(local_path, sheet_name=None) md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + for name, sheet in sheets.items(): + md_content += f"## {name}\n" + sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) + + if drop_empty_cols: + # also consider headers to be part of the column + sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] + + if drop_empty_rows: + sheet = sheet.dropna(axis=0, how="all") + + # convert remaining NaN's to empty string + # because .to_html(na_rep="") does not apply to headers + sheet.columns = sheet.columns.fillna(na_rep) + + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( @@ -870,7 +902,9 @@ def _get_metadata(self, local_path): else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool, "-json", local_path], + capture_output=True, + text=True, ).stdout return json.loads(result)[0] except Exception: diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 3a41e17..9153d52 100644 Binary files a/tests/test_files/test.xlsx and b/tests/test_files/test.xlsx differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..1eefba1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -54,6 +54,9 @@ "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ] +XLSX_TEST_EXCLUDES = ["Unnamed:", "NaN"] + + DOCX_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -174,7 +177,7 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) + validate_strings(result, XLSX_TEST_STRINGS, XLSX_TEST_EXCLUDES) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))