From 5de769f1bcd7b32b095255da36e929a62da14054 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Mon, 16 Dec 2024 15:27:03 +0800 Subject: [PATCH 1/8] chore: excel improvements --- src/markitdown/_markitdown.py | 27 +++++++++++++++++++++++---- tests/test_files/test.xlsx | Bin 11562 -> 11770 bytes tests/test_markitdown.py | 3 +++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..daf1127 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -80,9 +80,15 @@ def convert_a(self, el: Any, text: str, convert_as_inline: bool): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in [ + "http", + "https", + "file", + ]: # type: ignore return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse( + parsed_url._replace(path=quote(unquote(parsed_url.path))) + ) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ + def _clean_colname(self, colname: str | Any) -> str | Any: + if isinstance(colname, str) and colname.startswith("Unnamed:"): + return "" + return colname + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") @@ -514,7 +525,13 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" for s in sheets: md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + sheet = sheets[s] + sheet.columns = list(map(self._clean_colname, sheet.columns)) + html_content = ( + sheet.dropna(how="all", axis=1) + .dropna(how="all", axis=0) + .to_html(index=False, na_rep="") + ) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( @@ -629,7 +646,9 @@ def _get_metadata(self, local_path): else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool, "-json", local_path], + capture_output=True, + text=True, ).stdout return json.loads(result)[0] except Exception: diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 3a41e176eb860d6d78d92bcb2f00b2524d925df5..56ec4978178a08dbf5c627d2e2792c61486a7b25 100755 GIT binary patch delta 3944 zcmZ9Pc{~)}yT@nD*pq$9Iv5#)M0PPDvSb+hzKu0w%lfqqqbS?NO!g&(C|mYy6G=&S zl{IS(StcPn*Z1Dn>({;a{Pn!f^ZDbv&htE<^FA+Z5AAAYXt8;ptSu5j001pL0Kg0Y z0KzdcVg7f0T>brhu7vwyDlMk{rj=Quot)1!ht;%A%iL@pm|17uV|v69w{*G97$`l$ z%IeS#6;Oj6QUpGJ9Z$ZWXIZPC&M#1yqr&ePWBc{-Hk_F0do6gCI#6I_*;hD^Tzur7 zTRa@nuwc~n65GV8y=UaLH}<&*YhP`v4>m+|vG%IU=nU-Yw6QQ%ezLO& z0%`B2>}t_Fu9(u9SxtyS-za3IwrM^XSK--+x)PM$a8SpUIvEQ?1%h@$B)B6IEy?6J zpY1!OxMBWl)Hgo4i4EA%{!|SSbU-LoE!H#4IGWAbU?umde?15D`Zu#n6<{p-?HgMQ z@)%=#`9{T0&2SRY+_+uu3xnFO)(@?`gIYXGam;lLf2_NJDH%?>&mdJuNmei=-s`k5 zkL>6$=q@IxI2y9=x|Q=)(d_Y>xqV*4BV7*dX6Cn{Vi(!xAKPA8EUk2Y+^uaN)tI$i zfCDbaV9hTT^ZNC)_+HTSH9WC$|LlVA=eoImyN6_00nQhA#L@zi9un0U3!5Phnp$Xc zV@KwNswJK)v(`Mtp-XSb{nW{Q>Yy+Yi*EHP!pO^On2I6PsSw46h10dpbng;2f1JFp z1^94oG=zB)L6on3Og(3XBrQb<`Q4vd$LWAQY+R7Uvk@ZTg|KqlO*0WwBh%P5}T%-JWcdy{xhJ8 zF`SC$kc@tGPe66;*5r+zuKS&|$c~Lf;rN9Yc2UIp5{Z7y(8i)8!w+OOo-ttkT-=T{ zw!Zi%4OQnt?25;1r?F~09p^ND@?{q}Mt}N^%bdYt>W{-C&H40MZ3pi0!(faN4HPnt z_WF^m;Tk=4D;~Flk`b3;sTbve1w``HjCK3%lw zMNCH(UEmS%9=avP9}aU@HcGTeBVJS~%zq{FF+UHa(9x&Q_ zv&-UN(>R0%cD>ol#;hmGt^##3M=r>!LSSj2gk$L1L3D148A9z*Wn#CNM{psD_-@4&+~Z!_$caxPSHbGqhi=$Bx7O;1km1)2?{-dj7 z=+Nq|6-A^jc^ZG2FoGD{q0DmaBS5YCExu|wiv_o`$|5IlKrEu7$n@>eD$`4 zKG0YH{~~s@DppjK37+4 zDYx|>&52ns!4<;s6_>+4I?Mf$|b2lrn*amt7y3`*G5M1W3+ z0l}21p6v}r7>l3@w0IdWd#Lj(AheIli2t}%!9$2kV~NNkW>13X1uT3A3TjH28*r~2 z%*Df-P#H$Kl<8-uWqbQdhD|eF&rn#B+o})oU7NYqJGJjJ1^r$*rGc5?)aEYAkA2SN z>OaD!Gv)IQ?bi`;>|TK>@&?)$x?vnW8D0;qIGfG4&76d2JEW599!jf{Qw2p=gI0iN zKmg$E?A(OU&WPERw7jb65MjfHZTf)amnZU+&ho< z{vAua*=*0y&sb?ri3tvB>|+|_Z{S47l`(hX>JK^mcn0#Sl@PdX_w{z?E>7KwKEHg_ z(|nZ-m;8woZ@EkWcO}@)k$U6j>ol!9jM0u!wxlX6)28f@fNaSS>UIw@vCYbjF~V#DlQH4uWfFJG zu)-T9p215ZV=#p~wTv}Bg5+)iylCap>|t!nL}j_0tI4-W(flx#SUU|$av}3^ceF^- zZ6sIfVnsZsPy{wWTi^oY^R&JB!W~d}wEdarCI%^z^VNGaPLlgU{_KeMAmw`O^NX2% z4@*(kcN&L3`OW%G#iRt`XQIbIam2ujD5SI-1{NKP9 z{TxN{@(Xo|hvRW}?zSUiJ1fFrFKXx{i*jjHyxpN@ymDAiT1WU;QX#V=b99w1_l}eg=MprH*P&ZUH;*C+Jk%YqS>}I@j{OEr9-xzfp{u5Q>xbss~XQ!{% zBicx~D;zayyO0<2z_Uwfh>K4uqVqLQNyn9UA|K56j=;!!T256nQO`(L<8DE{`*_%7 z6DD$cd!X-XG9K2+7yDArT6vBsGvvb|Xuo&Sbj{_&tP24kShM;tfa1WXwC6-}nfBb| zE@9>V#gKf$wBNIwu$s``~owEc#2RUB3i##(zfpgt{(V#+s|Y9nS!ps8;IbUE1tRM_tZ zF>5QO&YO65nhn3c))VV?ad+Bs=Z}`zv0%QdN84I-wFUk`2ZUDy`K(41m{B1SrJ z6HS!c6kyVj?d&TZ$R@3pfR(0x_EPJWc@8u$$*kbyG}8UZBu4hq@i&eaXv1~hVT%YH z?(CAC-5JK+9_*NXhs$#hTOrtIQmOTm>wVZOgmL43on@K2te&&nl1W30vXOSFv-Hn( zT;bhs%?snoAjT+1_j}cu&#*D{$`j15lqLeJc0*z9E_p2UT+m-#Jc4hB1Pvb5K`a_> z7@l1!)=tq=l|}S&pRRc=PM09pp{{@C^83&{?)!?fId4z-0^+|d?0);cA zmFGT%_J}znGWFFH7ZUdrHKwyoJHDu}xw;L+Qlg zcKwkrQwgq^eJ<#=xFDFg-yP{3Y{)H#`GhTFOF&xp)m}C!_D^y%z!MMXQ&H1jtdr}( zpF!aFCUIbQrpv%Cgd?MEE`m0~eUVtIE?+$+yGfMZhoo(GNZdkCH)*P^p<9l@B=zc| zqZct1tqy@VY($j5^JZ6tgosiScrI8=k);BerilrF(W|+~4e6pKnpF(Okesj0};0ifF7=P(hOxH@| zVH6wL<@U2nVk7s$sobJBw@K;I=ktGd1_GEEg1{UA8UWpSGYJF!`4FyEb$;i@=UEm2 z9nj4mZF1K?AV|igzUnf|c@h4DL)J5D^YVsMbY@1YWsa4$B_$jG@(`Sc5n1I%ikYx< z@a)JQZ`Vxva@4-sx->gQ)&Bd?xHb0)C?Xt6PelmOjT|orK24(QO#a@&W zuiCM-#iqm-VDt_rZRljLPQ)N8VdB+I}^nUhsSY z`v9YYwvy_+ub(=$R|GkwBG^ehV)*;_~gmpNas+^O+BZ;8G@Dpq#Rj6Eq2%Umlgcp)XZ zMM#WrbRMP(6MjM@2qsc+DxxT1ON5V*CB?$`p8=-=09gMX;5>Rm=$3-=|KDc(-vXpK Ofl6A0Mi&0>=6?a}m@=;b delta 3750 zcmZ9PXEfYhyT(TyB@v@MqW36c3?k7PL=RCyv?%3(974&-pk8PJjlcSy{U&+o-F-E_~<#fGmzzO5OvX}Xw=)%vGNk>*?FB5QE-+$ zg@luWe#!U+kyG79rqHCWQDAE7ree6`TGgkE55>AG2{e5{N)m*Z9&8_mf`)!hqJN_@ za$8)qHGXEL0!=uw{WrS%-v)C4>xxh2LvoMUK!yWo3*qrQuLNr%&FIBz5O+(B;q_OGCSIF5GQ;1n6r<#Cx+u{p~0q6D_Qw_IqG?gU-#Q~~~?UQXteRFf;nen=NLZ)NzcsCwh5b}jOY9#zOwB_#hMfbU4~aL)R!Yb$S6V*h#(10Q2X zSaELF99MbRg<>KHpU{b``@&Qg(#w~!U+${(_AelRgC*QArPwH4YAv7hJmJ2xe>BIj zzZTr4dordo$)msj`hJ6PflKP05}zTaL9io8y#@D-I#&06Mo^b-r965o2fF!mNUPyz#rO?gM$zlw_H9*vaklRK?okBkL$*7Q-bRsp zt?%O$I7?|#J$cvrJ0NggE!+oRFmxyCQA-0g(5=jc6F$Pwf68Xbvzs9rDF!G`K7S_8 z6}`Z7eMyT|2x{sMEqw>Z(V`S@e#Z_DMQ%HH?U5x^9381OT)vy0ZrO}DWIgKKu;h|{ z`>cEI^|;)lYj>FH2)qFkfS$zQ>? z1JRHD+I!^r#QwDBG-pFE z7g2HbNF99mJ^QJq=W=|rZB{(;9*YGa1>`RHc-J-Fm^MKa}W z|84kRyGl3ML(ZpmNHk=+M0|am^JI@6t_r$*rS+C#3)w`!rvnIW$WpIbFOyue%7sHS zubvdoZ`;4ZeMj`&2g0h+c{;iKEn%mZ9NfGo7@LhO<%b)xC!s3Mt;0l5#jqy^a9)3| z`JTRA-^(QjsWT_}i;()L!yM{un(|pQ+=RgmnFW%XuQUdvbmXTN=>g;*&;v#g=sE~% z4P^teFaw^Y67(@dt)B|!^I$T;aS@v063vm#ouc|?WQ=VWi>rc+X&5n4M@JOJ$c$E+ z>L7kia`LVJ7U%x?iHp8QjX(i;d}XclKwL}97cD@9fX>W-y1mT}**)+KK6I+C_55KK zfk?Iu)tIGkz(~R^hv$Uq$et=B_-BAa{Qe9Rc9LZTA>#1VEFk=?ix0JdU%h1@zn%ZH*y#IGze0osEmM>9kerW%51I#P#CwO9 z-mLg3^&V_F=~~h_HG8N=y8Nr0po-#Jm{<@-eCOLpz;eT{Wi+$qsUuv@q-kxgYq)oY%3 z0BE^7A6rHXHJ*cgrzx4lF{I7ifTPIP`IV;6;u-`8MXn~!wT)F;5a{yq4?{06^V7a3 zW=haUKN4IfVe9TDHxn~%gEJJQ1lN|m682bV>Au;y=M(mAx^%p-%3liTxjRcdJzuIw z+ENU=IQBmY^(vC{P^sXyI>wi|8MGER&d;};6NuXZmqAT3+brudzbf2-waq&2^M;%R znW&Gj;q^KF^>xKJUJO;K5b-4Qfmr3FT~y}}W*@rN1&XPN~k}Lz`t`8xJa*OE@|R+_#=7K=UA7W3&G=0pJM*4qIqXJKRWzm|ULnh>JLaPKsA&BC z?yLQ@owYUEv>itMt_DZtD=)$GrdHmG zAtbNuDy#VST7^(!s`_2SeXYKfCNj&;07d}o%LUkf=oK|4dSU3w;*$p;x8^ zQw(0{P{q6jyl+=c8GRr2!4i*ioozMf?-OR#%6r z&Ihuss1as^6>|RJxesa;(6jv>y~jx@`*-W_0&G{Al_NW@4K%`EG*_`tk_9nQ&6s{9 zh6_{sG=aOU-6@Cd3T~Q7a^nK@=WNln-?VfwVmgj@B;OI98_rl=$8C{6>)tIR-M&Xp zi}^j=y4AHfB;DoJCgLbF%e$<3m86Qn_;`i9OPjFgUzUJ^vW9XS_shypNmP5UJH6gK z2VPpZzmS^FwywNK?>IUWFViJ2L}E^gm~Tvy@ROC^o45)j*?yLz&oC zTi^ZtThJD-Jmws~(w{!~ahJ^jgYf17&`E6wF~tDGr>V@IqfcTF@YM1XBs$_>k`7;A zN@*N7YQP@5L(em(0kW!{Fj=@Xu5Bhgp<3zoVh|OIKIsMDZ+x!m8uvd?`CqbNt>~We zFx{!QH%Aai!mbMxS3=ZuPEeWl?v1B>^wbYpOCGH{NccI4f0<8UEdbbC54S-&w}g64)oL`CY6dzR5sA*O zFZ(olX%q2;`9?Sibb_?(vv_Q*YPZxflkbGc_oc)b168B2&SH^;oY?EDVsFr>M!QH^ zr9y5{jW9Fr8nsl;J0X1%5XhxwQh<*HcyzCWhpB5p|BUV`9yV<8P-Aewub%<)?8{;h_jtT!Q{;YWASB?xU6-{ zy&329cS^F-aB;Zrh=iS(qQ$p}L*|dd$sdrpuiqsQuXap5zbd?gSsYfiW-a}A4bW=r zzRJQKdR10%l?}RU?fcaS9G}Su;Y>EpN*2=|je@cf8WT;0*&IFIs1O>6wI|MVa#5(h7&p|6*>AKcE{`)?%h#V-4T8Dnkn?feG!S1FXlG|4c{3ckPSLq9F1g#Hw~v+d6XT=uKrvLWY;nGLl1*$>4yi{YL~eyhxt+($$}tJMsBl@j-@}qk zT)#m|dr{^RZ^p0!?Shvz(Uy0vp6Z~N9*gd2*OMLoyp>I{@;GDf4Tkdeh@oJ-1|fx2HnC3@=${UB57~mAw+k-YCtAkS!@47kVCn-nA;@(^mC1mKmWEI4f4QbHf?w8*Z1Z-K~`<36uju> zyI3ps2hx8Xt7oXUrMGD2bO|7{%qy!*d{8fyiGa{PaN_;(vr3&G|>?~raXVK<;Eq-E?_5ivQ^ zTrRAG7=rW*H@1*R5IZ0yN;<@YE#?)$^1wt%$N8|8{OnjK7(M&HUHnISApbe@&x!<# ZfeCQ@ckTV(gy None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + # Check assertions for test_string in XLSX_TEST_STRINGS: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Check negations + assert "Unnamed:" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 19dc6a36410a1c9d42fd961b34a1d6ad5b0406e2 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Mon, 16 Dec 2024 15:44:30 +0800 Subject: [PATCH 2/8] chore: update test excel with a nan --- tests/test_files/test.xlsx | Bin 11770 -> 11739 bytes tests/test_markitdown.py | 1 + 2 files changed, 1 insertion(+) diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 56ec4978178a08dbf5c627d2e2792c61486a7b25..0dcbeb9b15bc026e88c46f179e8df38705cc70b1 100755 GIT binary patch delta 2611 zcmV-33e5HTTiaW(F9!up$a!RjlQah-f1v*$=v^VHt1Uz+Y)irhNQ%I5lkHPLOLRn7 z5?zvx9Tffd4&`g=Md79uY*Hjghi7IEhnFAMO>Mm}skmqpn_OVqVr?Nx-aaSx=SlAL zY)fg{mb4ZvOYB#s?2lI;{<&NUIiCtKw*a73iCt;Eh!Ijn#Tu$E#DcZ3G83|)e;S7J z8L5S2v{V&ix~Y+iaflkqTYG~L%X2WqY{mjW&sdWUUU80f!bD6#MjTQkaSvu+iMb7YYD#){scqg?skHuldT$FNTI|`UjQ`P zkUsIBU_{895UfvQa2}G6Bzxf|e}{WkGJGeiaE}!pzzF_hGy>A&JIWj+<}%BF`cSAsw-gYL_?t?RnpTV@Ef|Mg$WCSP;BUK-hU1Yu$J*o zdv{t|@P3`%K>Y9N3uFUwUTzQTwF$TP)D|)(Pv1x{9F4pzcRZ4hoSyFwoME01oWRY= zsD~r3mwDg885zcfpi5nCe-mw>6WfQZo-Ezc^=^gWc*)BHjc<7SIHv!p*{yvy4x5tp zz}ZS2Bw>cu`jNK~ad07W!Kv5%u;;^Y)h#~qQdf!Xfu#0x|1$nu0b|nl&37Pu!z{7i zFiJO!a(EomykUfnD0RgXsCB*7c7ne*1(iURHhmXl5O{3%kk=(Ke?A>s<30{CaeZeR zu*mTv5;^@6M~+uuKV;J*7$|Ui7`M~HZuf%L#htWF*X`FQ*zFq&w$?XFcRlErgO|j| zg9!J12WMX3_`SXhx%OOVHL|K{*1CwTRhs1xQl6J=^-d#?HWksc)NLtL{N*lA; zkSfq+7Z(J4kI4>$f8G##K+(@h?)apSo#8O_ogmG;<{ z=Nb=ghw0#a`@wiT_%g}<+iijVH=vt@o3@iF+R5bV2LJ#7v(XAo0uA3Bi3kM%06!D} z02lz1Tq_!XoRdv&+b|4gwjdi?M znD-%@b3qOc?;M}vEHlnuDQPL3^|lQOlG!Qd>-6v0J>y~`UFUxjCYv({?=lwfg33P@ zQnr|XSosB3d=Pdw!Qytga*$0$f2aSh8ID;r>BXFVf9QJENUU(oia2O1`k0lQd`{VV zH8#)>348zFI|;mg1qC7iie!}hvR1ze9&48^u!&>8*4FOg!WaA{jVN}-+Km1B)sdQe zc(UrJY(j?!JQoeLd{E&sxlEQto)%fYn4I5#?EVYLZ5N^{CLOz`Y>v<11-2`yCVA8q zR$=gHiI#ZmaC#x(tb@=P8+HU?xIGx|V|BaXH`SNjl-xgoo_INlyLccy@$1etc9q9cS z4!isDusb8-+Z3?_XzeFu`bo4ART|ORi2XBSH!S117MHP>$Exnfx=H71fgOodEJZ=m zz$v^3_OE6744HnExu=x0idlVB?B{uZ$9?M*2UZEZqB3Nt45P}5vePQAsYs$?q11P; zaj&>sQCViFEH#(3<4P-16BP?@IPhG6V}^SSvLbRgSO0y+Dy>VJpao`?zkn1uGS7?P($d=(22-8Lu9QR&Sk0Cl%?W|PD!VKxTMVY ziv7IQL}inqve6yqR4VpV2vKq1Bd*vyvq^X@p+%(`M^&+tXPrcc+n92$Y8~X9$YAcapIKm z4m?nN4_@!*f25hsaU^-|CM~#ssR9k>>gW)@SaT{V$9@DJLF;Y@zIk3%xOL(q$LAfK z&pWUWbFWPW=Qxq5Uj)8=fH!oyBe=O3=`h~xH@K`7XO1BWJcjbrf;&^38c$uKL%272 zg(-!#Q=j8XqTbO^5w_r^DNc>&1J)tFHZ2PAejX?-f*f}eIE!gf$`^%yHr_EJ;v3UX z;b&AGJ>q3mY>r6@oWwML_z_JW5x;1UdKCq?nZ@|knxncLs~UpG*9oTtyKh}H33Gw($D9L@1ChsP4%0lPT6J@}%;RqJm4N#Gx|4HCf;1k#R=aI?}RH3119D?u&$pzNZBlMp5wf5gj!EWIz# zrl?ecNDv|q6t6^~@&vtPD}RAJW0?}HtP~{Z2nH?F>#La8lvf&_bX9}41sKnfl%c#% z(9YVLVr=*hBr}0?C?{?!Etz#ko#UGEPnH9Y!f=iySSFZdxCQ$4Ktv~`-~+PW>#_p~ zj-dnz${I`pf`7agv^0MXe{{;oDGPh7U8hsrF2lQah-e_;PX&^tp?res-&Q&^US4UiOp<0iXL0WHxH z5na3_9Xlxc?;Xn5)Qg3iRN<5ET|YOi;9$g7s+(&O>5Ye~DLVbGT=vf)BC^_E^CIjFMkQqadwLD-i_mgJ}Mq zo9A9#&SfPYHqv$YV)2>RHdB@BR9u_1kVX_kw-3X1B@Pqdm0B+1rL16;!b4nqwU_gq zf&t`yR2iWfZp6543`lKD{a0nxVUNoeMC?q&ze}kEoE?x5xC1TEBRP4EOT{YN%W4ddfY&0}JQYAk<0zaWkp4+fs2*rl(=KaU8 z0BaTPw0CEe1Mk<#4aEPRzd|-3=f(E0UfXc@o|;@m^yzDujhIiFk9z(fMnR8J6#7Yu zLXV~Kn2h{*9DW05R1oDYe_xt%n`jF?bQxszWa*Z#cPo@cOIaLfd?nil+5U%SxAx6C zY)jgM6f1p@gdIBTN7+Ec!G*pDPQC4iA%o$nTYQv-DTi(kB(-+}Zk zv!VNrQLtv_Fa%c;E~-!e_a>U`gCkf1|%TV zV`%1!K4N{^M}va&Q7XJr9jp;-!8+P14x6X|{Jr^gAkPcZ{OlY>!NB*Je{AfBG3fzCmeC9`Iv^;H1BU!0>-l|}j8i{5khSj5KUl{?+)FM>v`!HtyT6|IhFqysyNW#@WM}hU3q-)7vv`Z_?@0 zV|K3b=r&G9=i84alhHrZ^zYpk_&)=>Nw{r0nc|&HuD$~Rv(E}n0u4$7()o`j+cm+*F$x$Da6UHezwcnz zPNm#LwQRtAJnuR1vU)gaMGjz{G)+O1D5eA&VM^KT3i|qEGh0&PJa0;_OalcygQE}Y z&!1LDWB0BG@B|5sD`@R~n=|G_4Vt^iw9w#4Wvu3Z9)InQxz++N`$bx@G>-3>=CYv! zA-A_MOjSt%Peyba8b3fRDDE+SSIgFoG%aqyX>RvjI}=8?h}cRc{kexyqDB6^YmDVv zg?XQnITz$)@lEl|Mw!0Y_CeU)1Pj|`^G=op{TshMGpv(P#Ft_m`$yNSQsN1RtcZoGpkI=Flg=qy zuX+RhlyG$K+Qjqi3n&mCP$Z$`k1_g1@Njm?0!UnR-|H24n$PF$oG!+S(&q<*?qmnd!DzQ7KD z0nf%#{jK_SVwUxI$6R~+alPKd`F=jNpFe-^KHv9dzq|c9f854@y;b|&ALGxLx9@+$ z^>(>_+~4=b7rQ^Tmr43{($2pa`E8_+lWz}?Q+v2T@oiBY06P7XGX6=>iYiw0T5)_T z4&!S)-^FIU%k)-{Q{BYR>VPd3DcBT$1(gO)@&VYrF4JX{>E<$z)RImxS04ns^||xD z^@2TD37n|XXH@CuDl6(vC%7h)1i^+{-@nF#;4)EVnNejay`&u%T2VDYu;C2{UJW>R zc)%iwBFA&}-)F26U(y3QV6GCFPmJJ_8b1hjW;H@|e2y7q9_KPGIjs}S8!nW8-P&Uf z8VG}0Ix4cxD6*CfXIlzZwiH~^Dd`25)cHZMTVFM*vdO5jksaq&3U*|KAlUN}7i>*z z?g5KzcvPk-a;ASF?WAXP<08{2HEOU6uWKIX!X1wztmEDKfVkMKs4`+u-g_gQcR7Gt zKJSa35B$P?B8p5-7LDSmgB2ftdT>J>3=5}@_u!uN19*L0|0C{fP9mAtY0`l!7l@qQ z(L>JfJbx-(bM$kA$O{;2rHkO203YEBvTrHGG_-~D$|Od z4_*&_twtC0Ze1ugK~8EJB#s(g!Z${n?;ZjAM$K8|9JynF-Xx372`__$QK5jGqK60Q zH)B(-sNlM@SiedQ1e+6Kqj>&m;nspz>;rIhRR!N9m0SZCUZy@!aw5!1R4LPf>tYdk ze6L>g>LW$;$;M`VO(DTic+y(on|_FA!8gwq8!URhxq>%EVj9?-1RKSZ3mX2Y0^$od2L#^O@zVf|CVKeO%;!4d?T znmP@$vm`YE39BnXE&HJCqJxw1B^!Sv%e}0;FVLo}Rfw$W#W{j zJv6Stne51iwA5pZltQsk&WpKv^O<x*0B6Lfvn6Zy;- None: assert test_string in text_content # Check negations assert "Unnamed:" not in text_content + assert "NaN" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 42027aac2d1f6172c9ba09ab6458be29cc12c2b8 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Mon, 16 Dec 2024 17:35:44 +0800 Subject: [PATCH 3/8] chore: type annot --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index daf1127..34d6551 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -510,7 +510,7 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ - def _clean_colname(self, colname: str | Any) -> str | Any: + def _clean_colname(self, colname: Any) -> Any: if isinstance(colname, str) and colname.startswith("Unnamed:"): return "" return colname From c2aae4dddab8e9e04fae9141aa1905e1df22d91a Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Tue, 17 Dec 2024 14:03:39 +0800 Subject: [PATCH 4/8] chore: make cleaning optional --- src/markitdown/_markitdown.py | 24 ++++++++++++++++-------- tests/test_markitdown.py | 11 ++++++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 376c75c..a72a963 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -527,7 +527,16 @@ def _clean_colname(self, colname: Any) -> Any: return "" return colname - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: + return ( + df.rename(columns=lambda col: self._clean_colname(col)) + .dropna(how="all", axis=1) + .dropna(how="all", axis=0) + ) + + def convert( + self, local_path, beautify: bool = True, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") if extension.lower() != ".xlsx": @@ -535,14 +544,13 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: sheets = pd.read_excel(local_path, sheet_name=None) md_content = "" - for s in sheets: - md_content += f"## {s}\n" - sheet = sheets[s] - sheet.columns = list(map(self._clean_colname, sheet.columns)) + for name, sheet in sheets.items(): + md_content += f"## {name}\n" + df = self._clean_dataframe(sheet) if beautify else sheet html_content = ( - sheet.dropna(how="all", axis=1) - .dropna(how="all", axis=0) - .to_html(index=False, na_rep="") + df.to_html(index=False, na_rep="") + if beautify + else df.to_html(index=False) ) md_content += self._convert(html_content).text_content.strip() + "\n\n" diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 2f061dc..bb666e9 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -42,6 +42,7 @@ "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ] + DOCX_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -139,14 +140,18 @@ def test_markitdown_local() -> None: markitdown = MarkItDown() # Test XLSX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + # XlsxConverter has an additional kwarg `beautify`, which defaults to True + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False + ) + result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) # Check assertions for test_string in XLSX_TEST_STRINGS: text_content = result.text_content.replace("\\", "") assert test_string in text_content # Check negations - assert "Unnamed:" not in text_content - assert "NaN" not in text_content + assert "Unnamed:" not in result_cleaned.text_content + assert "NaN" not in result_cleaned.text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 5c60d8ca12ce03a89747f9be1cb124f2edfac052 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Tue, 17 Dec 2024 21:17:40 +0800 Subject: [PATCH 5/8] chore: finer flags, forward `na_rep` --- src/markitdown/_markitdown.py | 30 ++++++++++++++++-------------- tests/test_markitdown.py | 14 ++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index a72a963..67f31af 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -523,19 +523,18 @@ class XlsxConverter(HtmlConverter): """ def _clean_colname(self, colname: Any) -> Any: + # Remove Pandas header placeholders if isinstance(colname, str) and colname.startswith("Unnamed:"): return "" return colname - def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: - return ( - df.rename(columns=lambda col: self._clean_colname(col)) - .dropna(how="all", axis=1) - .dropna(how="all", axis=0) - ) - def convert( - self, local_path, beautify: bool = True, **kwargs + self, + local_path, + na_rep: Any = "", + drop_empty_cols: bool = False, + drop_empty_rows: bool = False, + **kwargs, ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") @@ -546,12 +545,15 @@ def convert( md_content = "" for name, sheet in sheets.items(): md_content += f"## {name}\n" - df = self._clean_dataframe(sheet) if beautify else sheet - html_content = ( - df.to_html(index=False, na_rep="") - if beautify - else df.to_html(index=False) - ) + sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) + + if drop_empty_cols: + sheet = sheet.dropna(axis=1, how="all") + + if drop_empty_rows: + sheet = sheet.dropna(axis=0, how="all") + + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index bb666e9..aeba9b4 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -140,18 +140,16 @@ def test_markitdown_local() -> None: markitdown = MarkItDown() # Test XLSX processing - # XlsxConverter has an additional kwarg `beautify`, which defaults to True - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False - ) - result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + text_content = result.text_content.replace("\\", "") + # Check assertions for test_string in XLSX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Check negations - assert "Unnamed:" not in result_cleaned.text_content - assert "NaN" not in result_cleaned.text_content + assert "Unnamed:" not in result.text_content + assert "NaN" not in result.text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 113f7748b79a0b0ac060d331dec727adf0c04e55 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Tue, 17 Dec 2024 21:38:40 +0800 Subject: [PATCH 6/8] chore: simplify xlsx tests --- tests/test_markitdown.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index aeba9b4..a7c3064 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -148,8 +148,8 @@ def test_markitdown_local() -> None: assert test_string in text_content # Check negations - assert "Unnamed:" not in result.text_content - assert "NaN" not in result.text_content + assert "Unnamed:" not in text_content + assert "NaN" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 7b64e6ebfd370fc361ab0ace9a0b0dcfa89d8700 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Sun, 22 Dec 2024 21:22:41 +0800 Subject: [PATCH 7/8] chore: consider header for column-wise drop --- src/markitdown/_markitdown.py | 9 +++++++-- tests/test_files/test.xlsx | Bin 11739 -> 12088 bytes 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 67f31af..a576196 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -525,7 +525,7 @@ class XlsxConverter(HtmlConverter): def _clean_colname(self, colname: Any) -> Any: # Remove Pandas header placeholders if isinstance(colname, str) and colname.startswith("Unnamed:"): - return "" + return None return colname def convert( @@ -548,11 +548,16 @@ def convert( sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) if drop_empty_cols: - sheet = sheet.dropna(axis=1, how="all") + # also consider headers to be part of the column + sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] if drop_empty_rows: sheet = sheet.dropna(axis=0, how="all") + # convert remaining NaN's to empty string + # because .to_html(na_rep="") does not apply to headers + sheet.columns = sheet.columns.fillna(na_rep) + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 0dcbeb9b15bc026e88c46f179e8df38705cc70b1..9153d5292cbf75d168e4a753df4241f38913045c 100755 GIT binary patch delta 3801 zcmZWs2{hE-+n=$9WE(?d-v^DYERlpTvM0tqwjsO7(D1dDEn-Hd5VDi3Wh;|4YbeUT z@4M_F^ZLE-IdA{v-t*k^+;i_e_uS`xp6B^IW0tix?1sAJ6c7*<=mH1?;sc2|HJj#x zL7)(-3U+=n!1SyBoD?Imto4QlG5gDtmIogmbcONI(3lGjQ@G(*nOzPF?KK%q!s0qf(At1nTtdpi)}FO5DJZFx8hv0dA>5 zKi@D(<-pRbgq;)9&K^qB*PnYi3ppso0@0zGg&~&?cyLDUmgxRFyMuIodSU|RtBiGr z{$2X)yW}!)(#Amo=sN}{Nu}05jFFK>OUztx`wZ+)lb&0@TMOy*bwpRxNA(2s#|1R( z9UJt-0mx@@K{q;x@5CgFAA8T}9YW7!Uik&SzGB`nT?X1Jn%h%du^U+{?A~2_Cn|O0 z8B3Y+_cw&kp_}%byg9*kikiC-bVrHMAr-Fi#y>D0sDWX{Iw^^%xKH`&}}RSZ16 z%wCKxVT-LYHcL;-|H9431>nZ<{30Zv27)^?P^#hGJxhW4H zg8MP*$E^TDDsSs{`OWyIKyC2-Q&V90$|kaCIie(6<-Q*%XP)p)RQv4!f~@I;*P?c5 zvCfR}&}cU*-jA!=9P1qa_1=ckwEC;@hDNVQz@`b(SrSZ$h8t zR3vZp%WY!7!nG^nlsF@JbF%Lbd-8;p(%{N({YTm_*7q))MqWjtTEh7tob_pkJS}7; zrj-MeuZ=E!vMrEjBt1T$9SKb*jI{)Lv}~w;c!F3S7?Rp_YOqnbgGxE>=nIGJ@A^E= zezrI%wI89dOAM2RA!s{O@_|v|(c28;pvs_1zwVzOSrNEAQ3g38ZX?2-q+Vrt;>=Yo zw4t{B;-+@VuszAD zu#*I%dQ@^tg!^s+)6MKbR)egc`}$)vXOqx-w5Fp6)`@D^cK$vESH}EL_Qif3{|qnt8h_oI;h1$= zh`vKg_zx(lhry_S?QtR75jO{Q5#fnXigQkX{>UU>=T>3vgf!w20q=yAcFK=sg?s1e z9bsGLmMi1vsEF;3S^&yR1DnM@FQ*0GV3Fg{1k(7_YGo~hn&j17>fJH7-0ags>npnngjXRfPewC4e} zr)rHB1_tm3x~)=-ZCl|}d_5ES@9Lx7>!~$|-~OTA zmK}DcCW*xy5zi+-`7a;cjhydz^35=c1KFey5{G0~Q_Dbi3^^Wc&uBxF&?{ zFV{0s(6o^f8V5f&?h?=O@g70RYrY?aDUe*{*cnE6sB-colIDARXVGFvbgcdidh_QgrY$c&DgC+vN|k zu}U9{Wj%)2dD=zotQzayuYNq79_-Z~(Qp_Et;3QOx2u8_9%vrey_WM(!;9uqSfF4d zo^;25{ADeTXO+FgOYhQAs)Xe*tUc$sPwfy;3f`umc_;2N7fg^{)F4m&*x}bdtL;&; z2yg=%DRsSMWluFND&!KjUM^PVp>KP7pDy+hgV9TO9AZ9ppmUcUai{38vtx_hyqg!WkEjMP)Ef1AlR z)@L1VT?UI0xl+jwT@4RG?C8~Jxi@fJ+*5CEYH(%g>t-|LRnF4B-Rb497dJh#j-yOf zIQk&x%m%$Vm&`6SUe2rSJ+r{gtyUBsl996j&}R>8s9hH$dOTlEYnri@6swES0&+sS zy5vP;<%_~o54&uY2;XFM=OmgmzBG9)Zd1-z|Kigh&!4XPq@)tY;^O^G7NLHzSrpQW zbBSSNmA|(;XUNkVgoHY=Snl<5s@##8mja#r@6Vs}3=%2%?Qgy>Ib?tfg!p}_4g6crh@5hH{ zpUh4s_J>wBR?mW9haXkDGI|ZBNC$+q-#d>UPcGT+6FquOmVIJhXL_HmE;oG8d*qe2 z{w&(-CO`S|tXAxH(#-_XmT0~N1+0eSrtD3krUEVFx zBdk?8KkAqLRW+TwEef;TMw{~NU@q=CBku|$vLOc21ZSQ*q*KR5XJjv&^(_s?=Mutj#}D*(|ru`(oh`qR*#JEzlg zbsH)*oSp^7*j7q|F>Bwo2=QXOEk#o-s0N}MZWgr=RbDCfD7)m!RhJW`ZEZ^C-c@!+ z;_;$X9)!i%oA!9K_gClfyj&|egu!()$a^!$&VdlaDV#p5g|KSb0aJgX+0j_`XjY=`?y9WDuRV1cJ zCIG=2X#V2dWs1&)@mw(Oa0isTgSrdK)$=OG!(BN+jh0NBkzy+xU|-yHW@SnejJrU* z#lu*cLnDY6V$lG+K4Dx%If^X}w$i6HP3=(;V<-*Y1E+*jIm~Bkc`XXp1T=vb-^4(p z|ES^nq!2)JRmP7zuBm_2*BF77%3606rStLP&T}48`dNU>!=(u^en=nmyuttr09$Ol z;ZWXUvL}Tmh7^_DCo!E+R(T2B0#jV)smQ3R61)Zm+|0r)4%9|8sUs8*eAsX`_nor^ zS4v1BMwaN2VJF|-llgQ)>)Qy426%sd;k|HeMt%lJnZ}zPiUvnFAV%5Y6>~`mM6i%D z;{M8lW{+E!IK4rAvDz5fvnA~=LmLqnw1=Mbbu2aRRik1uOU;Xvp8A|#pRY$E-C!+L zZ6h-v!O|l$4*nA)m@Qzu&4k?YA0ek`Hw>q6q8A1?soKz4q^eQ}In5?N@o*bjVGrnn zVi~m#n;t!l$_9`+`kB8%~reWXVC(ed&AdD>X3P4O)9p#`<15Z|EW`?E9 zNMm2Uob8;)f82@PidG#CPl1?kFAVrqV`XG=1ak#b84W_;_XVHHfA<1aiHZCsqy1Z% z;U!>3;3<3vjQjtHe-;c1mciG+_&NWdY>7w{e8x4=bs8VMz`3?94<4-{`H0VC_+#YgiB;Y-fdE=x_1( zpIH5S!8d{bMlK#K2_s7u#3u=G;)N_#5g{QvI`+;k^Gkf3>vuKePV delta 3414 zcmY*cXHXM}5)OnGgCPV65SmnhBnpTUr9(LBy*CvIMIyaQiNZku>7jR|83d`)q^WeJ zHvy%dfC2_Av{0URZ|2>+{jsw# z0Dv?4IxZnvg6V=bP#PM>c|sJenb7;Cif#M#8qBn)a{B0ta}!UFo|^O2J|#cF4152j z-*+g)m>|&%XyW zNPP06^(5Rxrp}(D!jt;2Qd>2CWTsolR5|2FloK?kZ^l*abPD9BB`wHxpS9H&r)$nN zsVMVA+W4WUi`wnnlq+`W#@SIu?!r5qlo7QT>i$4DFw>=(9v4=$mh&j0s^m2w7x{zdmITNh*eMH0%IP zfOL8T!A>MBqX?ns{L-eKAo)2OD}mQ}!=M@0?M>P-QPJ_kTEgs76ZzOr4*`-#_ao6l&3qjk0|%D0m& z1kPG*K{pF8NH+tD%wp}_I$~IjHF^=1S9&5|*2=Ke7ZVWC-?_=`^obe$Av?c zZ~4t=vG|0!SHJFhN~F64VXNnlLXJZsLJg!n2Y$X{sg)d`=W7>d z_Cyv_-nKu8RWBAco0t^dw?)#q0oY~SiP80@4(?J-ci~aIpM<=*--`3@? z3J*BeD(51Q+!%G|VMSj^Q{t=|YnRA(4bdXo$-A zsMfc`(xm(qsh|0mhC40^eGvzL2U@M2ZRh(}5!-6Gr**^u>)U~A9~zGW0+eqq=;9|& z*O~g)rh3AtziOB%ucEPw{yGUvOwCFAcB8g5bF4%}_J{O)XX?$Lt9-)#ixT4XymW2z zxg#$x*dG{J99)Wdd9CdL!V#}oTYcr~jnNP8`UXy~RD{SFeAo4&9Jjc9BWU0{q9>er@W}JQlB``qQfD~N*d8}{FcwTI zN<`O3xSiO;t!U|k3yN(?D{uROR#+u(UJH>5omM)NlWXrCWSZUIBu}#rgZoXDZk1iL zxhm@0AjzB*_Vu_9DLbsZ?_+-c6$}8Jp92B^kO|xW*hG7YHAceF3^pjz_{0#=`nWjZ8BOwJ#Q|GQL>GC^2h!q+X;_O7eQSlXVQ=mgC1C z+8dXevoKZ<$YZWnlU*}P(;l%lr8wJ+H&b%wQegruZY}i;k`L(&1ZE1AtM_MiLK&e< zu=pBRPPIES>l%zX)SmsjW0Teu!QBRgW?W}rcdmDeIm%#CzPwfTCszCunt7qgxVwQP zl7USh${5EOzZ6)1;F1GxUX~4fRa^*q9)g`R9`xcZPIEQt#3%T-n`a#e=K88aAQaxh zRn)bSI}&aZdB2)2f>P%s*0+9A<`mZ)gzPp5Y+||B7%4LF-H0J&xN%dK67Ex;ECaYi8!#>Ko1z(FE~;-ud{`@@PPBFq!{0;a(v?GGJO-WQY?6i!~EW^Yivf)wpWO( za{Ss^sRg^1bY1UqzYpqCY zjI#m840S!TaZMX( z%*Fxs#O&HWa*PgPt{3vbSe%FTNmlFR6wy1UcZWK#XQmz6*JZXsQ`Z_slUoIFw<;UW zr9E4WU(7%p4b?Y?$6^O=sXJBkjNr&0(#lrj^tCElN(t`|;!fMAusBk2{W#x-spP&X zlacqb@QV*x)*lratah>G>^DDkjrmRR?nn)kq{q2!4{hZvwEB$hFs;!k8zjdK*GrDm ze(vD>-2RUB<6dy#>QT3WLX~C8UqR#0lYzUvM1EWGNbl9OJ&1x~D2qL1gHW11>@5^+jrNTdt*w!zED=q!9gmDni~Q4`E>URJ}zmI7%YwrAgtM_S402L)NC{ zG70^J4d9Gm=MD6Qy>NbUbOdi84D-azXoxfF#v(ylXq9KuV8z9hPjMAzhELQ2<@LxC z4q5g1S-V`59&*q(XZIdX=ajVZK1W3K&w?u8?W?x{au&wBQNmyRJy$=^tBdqk=2@po zSbHJiGkpge5{n9d$cA69v#Jrpa0FUcgr=}mTkZCKv}Q0-STcN_>rlqi!droe5HK6t zU6TQ|ujb@qma4mZW2X1)u2HbZyu*7^#D3TkO?QH*ZXN|Ni7`D^Jbc+F@ z>R)-8J?|n>9aqQ7pP5EzLWNk^Yp})N={lT{gvSbw*|4fAc9Bgc2K40C^Q_r43q*mR zS!5GE*Jw&wal8K5$b60i{tzN^?Xj=;We+4e+y8d{TF#z91~BoBSE4AvQY;JCLVx^A z1$()48Cbu#RS#rJET#huA#`aY2<`AjSpvjwXg~rSQe{_R@=@Am*PizyqX~0gxm$$u zM6cHoD14!+@Kt4a@w}b4iW>^^8Ie`(_Yj%&(|p<6vXnn4Qj{EuFN0081Q3jHfnBsLV9=Ue(YHt*R}i%R!lYUe9#YvADDU6nw)p)FxzYjv htp8WHScEiwMUnREWfJr%0s;X*01dx;gGv6K^dE_0Nv8k+ From ba3011721c5b94311caa3018df706f976968dc53 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Sun, 22 Dec 2024 21:39:12 +0800 Subject: [PATCH 8/8] chore: update tests --- tests/test_markitdown.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index b1a0f08..1eefba1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -54,6 +54,8 @@ "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ] +XLSX_TEST_EXCLUDES = ["Unnamed:", "NaN"] + DOCX_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", @@ -175,11 +177,7 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) - - # Check negations - assert "Unnamed:" not in text_content - assert "NaN" not in text_content + validate_strings(result, XLSX_TEST_STRINGS, XLSX_TEST_EXCLUDES) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))