From c207d69b0c6807c74ae60a5339f7bc660457d2b9 Mon Sep 17 00:00:00 2001 From: Alex Stefanescu Date: Tue, 22 Oct 2024 19:57:47 +0200 Subject: [PATCH 1/2] Add Workbook metadata to Table entities --- ingestors/cli.py | 16 +++++++++++++++- ingestors/tabular/ods.py | 11 +++++++++++ ingestors/tabular/xls.py | 11 +++++++++++ ingestors/tabular/xlsx.py | 11 +++++++++++ tests/fixtures/staff_list.xlsx | Bin 0 -> 8396 bytes tests/test_tabular.py | 24 ++++++++++++++++++++++++ 6 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/staff_list.xlsx diff --git a/ingestors/cli.py b/ingestors/cli.py index d0ffad3c1..a1d3bdc70 100644 --- a/ingestors/cli.py +++ b/ingestors/cli.py @@ -8,7 +8,12 @@ from ftmstore import get_dataset from servicelayer.cache import get_redis from servicelayer.logs import configure_logging -from servicelayer.taskqueue import Dataset, Task +from servicelayer.taskqueue import ( + Dataset, + Task, + get_rabbitmq_channel, + declare_rabbitmq_queue, +) from servicelayer import settings as sl_settings from servicelayer.archive.util import ensure_path from servicelayer import settings as sls @@ -78,6 +83,7 @@ def _ingest_path(db, dataset, path, languages=[]): entity.make_id(checksum) entity.set("fileName", path.name) log.info("Queue: %r", entity.to_dict()) + manager.queue_entity(entity) if path.is_dir(): DirectoryIngestor.crawl(manager, path) @@ -116,6 +122,7 @@ def analyze(dataset): def debug(path, languages=None): """Debug the ingest for the given path.""" settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3" + settings.TESTING = True # collection ID that is meant for testing purposes only debug_datatset_id = 100 @@ -126,6 +133,13 @@ def debug(path, languages=None): database_uri=settings.fts.DATABASE_URI, ) db.delete() + channel = get_rabbitmq_channel() + qos_mapping = { + settings.STAGE_INGEST: settings.RABBITMQ_QOS_INGEST_QUEUE, + settings.STAGE_ANALYZE: settings.RABBITMQ_QOS_ANALYZE_QUEUE, + } + for queue_name in qos_mapping.keys(): + declare_rabbitmq_queue(channel, queue_name, qos_mapping[queue_name]) _ingest_path(db, debug_datatset_id, path, languages=languages) worker = get_worker() worker.process(blocking=False) diff --git a/ingestors/tabular/ods.py b/ingestors/tabular/ods.py index 4073b120a..eebc1aed3 100644 --- a/ingestors/tabular/ods.py +++ b/ingestors/tabular/ods.py @@ -63,11 +63,22 @@ def ingest(self, file_path, entity): table = self.manager.make_entity("Table", parent=entity) table.make_id(entity.id, name) table.set("title", name) + # add workbook metadata to individual tables + for metadatum in [ + "authoredAt", + "author", + "summary", + "generator", + "date", + "processingAgent", + ]: + table.set(metadatum, entity.get(metadatum)) # Emit a partial table fragment with parent reference and name # early, so that we don't have orphan fragments in case of an error # in the middle of processing. # See https://github.com/alephdata/ingest-file/issues/171 self.manager.emit_entity(table, fragment="initial") + log.debug("Sheet: %s", name) self.emit_row_tuples(table, self.generate_csv(sheet)) if table.has("csvHash"): self.manager.emit_entity(table) diff --git a/ingestors/tabular/xls.py b/ingestors/tabular/xls.py index 90a6134ab..617aa4eb7 100644 --- a/ingestors/tabular/xls.py +++ b/ingestors/tabular/xls.py @@ -59,6 +59,17 @@ def ingest(self, file_path, entity): table = self.manager.make_entity("Table", parent=entity) table.make_id(entity.id, sheet.name) table.set("title", sheet.name) + # add workbook metadata to individual tables + for metadatum in [ + "authoredAt", + "modifiedAt", + "author", + "summary", + "generator", + "language", + "processingAgent", + ]: + table.set(metadatum, entity.get(metadatum)) # Emit a partial table fragment with parent reference and name # early, so that we don't have orphan fragments in case of an error # in the middle of processing. diff --git a/ingestors/tabular/xlsx.py b/ingestors/tabular/xlsx.py index a70727cd9..80cf795cc 100644 --- a/ingestors/tabular/xlsx.py +++ b/ingestors/tabular/xlsx.py @@ -48,6 +48,17 @@ def ingest(self, file_path, entity): table = self.manager.make_entity("Table", parent=entity) table.make_id(entity.id, name) table.set("title", name) + # add workbook metadata to individual tables + for metadatum in [ + "authoredAt", + "modifiedAt", + "author", + "summary", + "generator", + "language", + "processingAgent", + ]: + table.set(metadatum, entity.get(metadatum)) # Emit a partial table fragment with parent reference and name # early, so that we don't have orphan fragments in case of an error # in the middle of processing. diff --git a/tests/fixtures/staff_list.xlsx b/tests/fixtures/staff_list.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ae20e6f73c7873d526d50c704b69138b041d0b6b GIT binary patch literal 8396 zcmbVx1yo$iwk_`N?iyTz6Ck(~5U!txR=GZ zskC#Vg&ufEJ~@9nFOHN~*r+7eNX<*}jn1C5IwBt&baW<#i-Y#=OM8rypZ}c+a6!qQ zb%B#X(I7%YCjzFK!PK`1+;`{X$-y`9!LJirKvvyB@`-b-s=`VC4~gCaZ`U{ll+9yh z#S!QY(_J$tQSmTmeGPZ+JG1chV4dA)94AZ74^#edO~J}`Hc##4DapK^@hof6Bv89` zD=6jr6bUrppqq!HcrKXx*7q$QBw`Swamp!Ur%#V;FI$hDn#jK-}J5HswpBOg{1SQ??XTsgn@!m`>!yOKz`w2 z%kdXd?hej2CJqiZ>>hTuzffyVkgx}FVo9H8g3^6g=&C*zFIRkun}QJySW{XK2iNT8 zh0IPS7Q-P+>r0iEUY_0Dn+@RgnJ!kUp5#@_h$k0(8)nk0F2w#>sVCYD`jUxHx=Yu= zC{fE^F=ZMErM$WzfT>{0A!$OM?(tG@Q85zeP?uoMLAvUh&R$39CGNS>Gm!+<`kC9P zF`@n5W46H_zeVrO=E!gsl1e3Ee%=Xsd_c|-jfFfftX~dwPCWJSr*_AXx^dIPW9F6C z(nZ8~a$136%%H5;kyZ~sPNioE$45q~mqh(0qs$|trq%S6llB6vl9z-Qnm;qlSzDnW zaAW?4Z1)p_N;~X7;fDDaZjTXr_j1;LMjXrcKuh}?qXim1ID!eRP_TeEx>~6_YuaSq zY_o>}t$VsCY#^zS#Izl>H(#Hb#^WN=C^SCm4PzRjiHv#;eJsA#fMuOf;cyOc=og84 zjsMyj9$7fhT1_3kbXxs^&f0g-22&_d zHe0e1oJTh7csp2y>;9asI+0|WLW|#XD`2^(1l7-LaNe08>oC)axCV3V_p})5Y|mE* z=2v58^}__!XmKwPyqZCV?>VO{?WU`?Ees@EuL;-dPT3lf_BxTdS~@+spb_<>_1#3u z(iyID(QOsy#A_WFmc~VzeNV(BwiQT40A5@5pXl6bfr-dun7|AdX#+uv>AEf^!~qMB zQMnY~6K}5WB=$FDVIyCk`r*AxUU@MdMPNLK+P-6aASpt0HoC(LY-ZEylfTM%wWn5e z!rFMoMsg*6ko%faUMEcD2kKWlf=H|sg@g7>fw%c*Of_Y?=R1^51OIT8I{PA*pF z=B_RrzuveWc4D)pvdgRhTJu5q@aFK?#20*F!=&%>Sr)Ttyo2B^Hw%X?|TfO|Zy=SoG zBcOVc?{LTNx-Mcjp|&L~qlq!03k$sHt9?SM{t=^rBqR#Fe1cS6K+1|dOj~% zj|AE}5$Hxv@e6VMY!Czu=*4wImQUjWJi+RKH}SRwV>oJG)87TxDERr%84fhNXHf zFoThCDwa0PzoaPiA`;x;TM#K5+7%|!BI78-e`=!i7j*hLTDQIOfN!5oK7%W(Z6$BUQXXN*s zth%OZJ}K1gHp%G(qj7to$jL-9h6Oq)GDgV*FI(V8Xw;J9)rtC`kKbDFdKWaE0LzNR zm6&W}@Nq%gC(YnazYe%Amql-cZhmBJZQ!))jPPPzSD2>Wkj)VsutagMI%k4lsV$v# zf84F7r{a_?{1(lIphb4L!SZQFh5u)Y)03fMTH{Gh)Vcflll9(a?x~#y9PJRw%ycx* z0fwmj4qGQO_8B2#y%tVASJyCHGTfo5m`-&<(&FcaUqx|=4v*+$l z?*oIvnk$mntKT-ESif|=a3U044sD>E~jb=!E6+=-;2v5!UsaBsX; zYg94bdSqsw<_g6eu$2|$cA5l_F&Fp=-QU6_4BO5RbYSfW4%seC_uiW9Y!+SGKuJ|) z#%f`IZ7E3-&_YY{xB@=~zAJVN1>iNTgTqv}x79!Kp?D&xcf`$c%-&Ch+$Tj*86o?vQB0dg ztR~Xy=U8^iBInl=Z4=rrh9@Js<yCKNfQ>wEH11<4(XBj^(XThDzI@M>@EedGv9QNsxTe;b~KE z-;_w#Fo(0y5hEnUB8M?2{uEqE#!2~cqD8V7)o{iO1spuOuFXwdsWnxB35!kuoL)Z%Se>+Hf@ofd-k{41D}pII*xXv-bB#nQr#$E+5I&7 zh+5Eo%}e9vOb4L=VI2OL)ce#Mr{3kfy*x-vy@ASAGJ1Qfy%{NUKiZp`U;d zt)*4n37`pt&_&DxU5HiJ-h!XiEUjC~T46-C2JlL;I@$3&P3`hH=M*6q4)nV9p|o-`8PdN;VB}KMby!&?XYR#RJU^e0Md=SiY(lg;VA;R$W(vQ#&9-yuDVf>n6ZPCz71;(=N@MiCGu`IT(O?A`_J;9lZRJ&Ybb`lw3WhvLPxrY;`G| zr@iakqJX&bfYSNLUfL+x>Kvx5>EB{v-@jR}S@c-cF1$c0z|7_TJ>dR!x%>iZ+u$!T zk$nvj6ViW)3EKY^6Dfa-iNe=#BOX2^tHFw`&+F7U+5&jjkQ)OF`qOyv!%YH-3d(s5 zHWBvTwjjkmaTfoCosk^pTmhzn1m&Z9t8{s~-S;HT%2n| zCn0gK2oLf)rYR)bF;4T01iIXFH)*Tb2BThdPD`9;i)OVMhukPgxWaBk-#<|)uQ#)L zT3q_B)=@j~BE0_FWYxh%8zuvWO5aO^ z(8EWEcHAevZ9Fhfb68BeX*3;wRuJ%w0r}#zB=)f9FNup1dnR5;-(ymDA(Sx6cF-QzwaoYZkl|3;jCi9rLMRCnh&# z@5!gTrl=S(g&`KX7r1hgV4Nbpms#xIr<@|hLV?y-J`C1fhekhUt~_{!Gn97>7TGFE z8%p|k)2sbcFWHEcP7g%VGU&<0QH;EHaU}uS z39okEP!s5GjP7|_++ocUV=|v)zE7ANO+%)4sTsF=r#^Z>%)b|R`ZI8p)C!sJjDGcO zKYvdtekk5s!|`U0wVI(Mi~eok?2`~peW3r^EAPZ2W($LKe60rI;-}@zx0Z{|ZBax_ zWub@5=si*His#qs5cM!mZU9?#4ljGdwkmb!+vg^*qOs=cWW0K(-zkA;ip|39A8OV0 zaUQ8$|2Swc~%jJ3PoJN~I5ptqF(vi{4 z4flC;o-(G=pKwX$m6#+3`d)2{^R z3vu}#&(+sVx~r^{cjm1Fz!Pvt&jcbEG?m8N3Z?a@!Q=sO9sC$u`nlVjULG-q%zURo zpclw#V3RUu9?A8IF@%wu+@KSa7m=Py2vYpILVcX>i{%|u7I1=<@iul8gG z8q+CS^!B265#u7H$or~R%a3pAa-O_(P9C*5?^rE%zXYB%d0>$;EhP+ zkOemD8M+@(+s0?xV^64kJe?Lk7C7xi#XC9lkzrrk7-7zZG8#%=o}P@OCO25*3Xk~* zJWm=E%g^nHSeD15YZYt_)e4G=uc^3nznbm zcQ4p_3)0?j=tH19`)YanE2DH)aTrJeIm(V8{ZsV8`)hf*SQ$H;o4s^(2H0D=JVc(P zQDIduCmPgJmLaz1#whGou!@~Z0}29>xQK-raQ|sTC$p2+hZ`>#G+UWyA5P5qn0R43ljax26K#`{YoMs&Y_oYL4eTu$K)IO-%z=F+M55uXir;J zb(WK?`QW8?ODr8dUg{`S=Qtn*`5g_lvT!O`p7n=!+Vw1l#+RT8dLp6+Pk%BTtG<$JacM69HW>KmFPwM z_&|cUfxJAgb@cpEU=lQ5VYw7oKlve_S4Bk-+JL51#@zFc7MVf`2k=BdFM|slI-~)ksJsmy;2W;Az6bu<|0=yFEF}Hw9T)i3IUknRH0;1614}aO8#Ut6NJ9CLjLZ-E zF55mWXx2!W3AzqJ=cX3UoX4SQS*>M3Y5#la_k+8xPZWt|z1J8PSojlson7v72H@IP z8pZlC2My?)hwPP?og@@%Ro)3)WGcR;nCAN$(d6wlB_SWnjiP8gkpGo#Y#3 zI0A#IcCI$fqy+pD0{lW-y+&Q7(pwaCt(DCuPBO=^8Sii{c_*jC-LFUv(znqja*Hx! zneMh5k#P|+T4hHUAL2-~YA9Y1WaXM-|Ifmhtu*=`YD?7|l zVTB&t;)gd9&PI%OAK`qu6rfixupKYvM_EBfH-zDfx%YjmFWmS7qnYctMTDn)*s$3q zWZ@ekV1R_bgoKuJ4h2wSbJJ8`!p?>Ap13XXX+D-$+FOQRnx3M@d3J`MN zpSYlxnZ(AqK(!*!>@kiP&@R_e%3H+gx?q9m*`2FEt8k@?ZaD-^d+KFa^`G!tZU*=P zOK^BroLq4h-{UYH;L6)m2qT?W$GKH7TB-(@m;ErExnfr8NQNSxo-W22VxFs-k1M3# zydvY=!+hsD2OCAD2IG3|{PpN~_sYU044_xLq4847t`WfAc-3Una=+hGbWL0*rli{&_p!s zX=t{fWNZsj@BC<#P2Zys+oqZsE}p#E1XS;r%&u1hTVf9riWEKzh=+!P;{L}ZB-Y~z zHFk7-n9r>Dda58!yspJtJjE3j8-6!>edG}$N?xKYF#l+^CFj@b0bOJtZ&azJ>0?*> z%iE!l!E;H|BBoUrpGYiZChxY>08sV@O8CZPgp zg|oqb&E)d1+JJR|GW#oC-ImTZTZ@E%(oB<5I@&U~DvdgUj><(9^UJjNbQ~VZ3h~cL zm6=;KgcZsMTa8|j;5Zb8wvu&Be*OAOYU!^6lkgyjHbP$iJx}u>wSI4rH!iLYcE3O! z?y(Qz#DW_3ZUr69S5kjKM+yhUyI=wilDXP&49qaAK5n;8zDSw#&gk&s$v(RHY+0p1 znfMf6#SM)X5KNv}uX(ZN%zzjT%}`FKn|&X=zyBco8t#NGua$VN|{{K@XlBJuAOF*N6DP|H&Wq|4d; zf1#@e_s>9xQ{e0KiViD7NZ zV_Ny6an|~W(L$^V4GOw%5S}#3)+O39>Q~7V@haX zeh5D?g3PBn5LYXR4}`0jB$NT=Tw9%Wj7dsTzQ z6j8<}p2h}E7J9y+t;Qa^bZwz{1DJA&Y!Y6ta~r+B69oQBLh}kPaSr* zhj?z?_P}8!`8x@qefH%gHVAWHA++x?Rv)mBYel6T(({OGXDR4E9z2a=(8 zdIt$6n4P#=Dlaj_6vOY8W5j;V8v_Qyv^!+ZG!P%wI5Z3%)bCR3pd#t8YV|6Ypx)6e6&%0n^kw;4hr7sSuw(%heeA7@z~a;d-VGu*!h|DQt4 zpI#nkWB%RCJVYV?{Roe}{L0V%>E-dE?ccpzqy0xOzw$VLdU>qPA6VzNkwD@!$o56bqp6+j$5c=(sj{d4H!`1Bwvewzv5V+Q?O zWc)e&F&RGuvERl+{Al>!A??r6kLl^(p$ma5(EmW+e>!-)PyXFOB`Fltf8aDVMR Date: Thu, 21 Nov 2024 11:38:32 +0100 Subject: [PATCH 2/2] Remove unnecessary logging --- ingestors/tabular/ods.py | 1 - ingestors/tabular/xlsx.py | 1 - tests/test_tabular.py | 3 --- 3 files changed, 5 deletions(-) diff --git a/ingestors/tabular/ods.py b/ingestors/tabular/ods.py index eebc1aed3..ce620451d 100644 --- a/ingestors/tabular/ods.py +++ b/ingestors/tabular/ods.py @@ -78,7 +78,6 @@ def ingest(self, file_path, entity): # in the middle of processing. # See https://github.com/alephdata/ingest-file/issues/171 self.manager.emit_entity(table, fragment="initial") - log.debug("Sheet: %s", name) self.emit_row_tuples(table, self.generate_csv(sheet)) if table.has("csvHash"): self.manager.emit_entity(table) diff --git a/ingestors/tabular/xlsx.py b/ingestors/tabular/xlsx.py index 80cf795cc..e37c99902 100644 --- a/ingestors/tabular/xlsx.py +++ b/ingestors/tabular/xlsx.py @@ -64,7 +64,6 @@ def ingest(self, file_path, entity): # in the middle of processing. # See https://github.com/alephdata/ingest-file/issues/171 self.manager.emit_entity(table, fragment="initial") - log.debug("Sheet: %s", name) self.emit_row_tuples(table, self.generate_rows(sheet)) if table.has("csvHash"): self.manager.emit_entity(table) diff --git a/tests/test_tabular.py b/tests/test_tabular.py index 1e7ed6147..ef5880e01 100644 --- a/tests/test_tabular.py +++ b/tests/test_tabular.py @@ -58,9 +58,6 @@ def test_metadata_inheritance(self): fixture_path, entity = self.fixture("staff_list.xlsx") self.manager.ingest(fixture_path, entity) table_entities = self.get_emitted("Table") - import pprint - - pprint.pprint([x.to_dict() for x in table_entities]) parent_entity = self.get_emitted("Workbook").pop() self.assertEqual(len(table_entities), 3)