From 49ec6351ca2017439858dfe4f31021848dedec0e Mon Sep 17 00:00:00 2001 From: "jasmin.ziegler" Date: Wed, 27 Mar 2024 16:05:37 +0100 Subject: [PATCH] fix: additional column for patid to keep pat identifier for joins, include new datadictionary (#165) * additional column for patid to keep pat identifier for joins * updated and uploaded datadictionary --- .../datadictionary_bzkf_q4_22.xlsx | Bin 0 -> 13439 bytes src/obds_fhir_to_opal/obds_fhir_to_opal.py | 8 +++++++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx diff --git a/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx b/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..927aa781454bec3af0478d5f2838a0e70b523732 GIT binary patch literal 13439 zcmeHuWO?(Xgu+=I*IoSAtulXLF-58T-w zcK@oY*V9$Cm({9gD@cQYq5^>fK>`5*5dslLiE%jt0|7;W0Rf=^L4ME{wzqRJwR6!| z@pLeC)}`~XwITQl`hg+`=)?Q(|9kx(UV&bfA-iq{#5VOKevx*y@Q=c(s-W3zo=8peJ$Lz?dxEdD(sKgHLa~G7!J8=XA2=hr%F|OBb3zDO#e_v zjSeSlegHxdjRwT(x*ufN&j3!u7Al0d3#+hWny;;nr7mWuffb8j^8zW{;=bc)n<=4WH&BZ<;7o-D_jiRtls^MNEzuz%Sx}0mPiJR94ZC4OK&vrrF zaE-h4^hHJ5k8;d+h4VPxiOu=(nHo(XHoV2c%^A3(M*eVf9uMqCe35v-i1K2+m3EdF zc%T6I?U>iW^f{L2jSen%U9D^oUnlJj4es;1n!UY&04e;3t=6kD5Z}D_$h?~(%)71X zJDJ)z)6@NN|F4byA2!T?n|gVItb7jxZ0NbfbJ*bB+*%x>khHs?L<^ycUx4H)LVa`| z3HDkCIS!%J?cx8-8rAQ6p1f4nn3Z(!i6rDHiqgKDH=DN{+CGeb( z@@YkA^>_B1qr`~}|AmyoLpc6VoH8fVX=sDaM&?VEK7%%dKOX?9$`+iK)ka^OxQIOU zO>8=^MKU^2UVLcfG6$82Sdj0TCd3CwbAI}3)v;WTWO??oL-mvmUJM0B;uo*KhpoRF ziBw6U;p6*?h=TwD0e~QZJ#6UzVJGhPPS!^D_SS!dw13$Q@OxZ)NBO^fwI+;N^)kQ) zpNF&uO|>~DyebTHc~ge8HLO4|)F6k++8CGcdp@Nnov~Htr%vXPuv{~anmuveY~vEU zu2WKc!vSqx&KR$U*1MBeza#im&)j?mj}|3ST|r0BHnC_eCvVasKvcq^WRTVql%%4K z!(z&pBc0kmAm|J*rVgq^XcpDnf5?jJV9iOljL1?%bl(T``oRu8*xPQf=tjr3`(aOB zV!;fB!zAX!(0&WFeAYwP`~c?pC08eLt7ia2Q-seEN1dcdx(QVInR(QR>AQ7z+#J#v z%zz3XR{^s&J*jgo6>O~U(PIrY%NYNyliJ%6%-?AaaqD*ml6S3Pg8%|Tcvp#c&G~no zDOc6C&tO3H&8~Y3c$TYyBLq!Wdzk&|s=VNu#UdJk;}ED?oyI6zTKeX@ke{zlzs8lc z*xx_!W8M4g=jCA~O(u4wb*@R^Ad(n-YRPc$D#^jX-e`23GBz2r8pRNb%ZRm$*VSdu zrP|?fveE)P&>)v!6Qk;hr)&Ck!kl>NHnQ5hRO)DF*}8{o0q4k3rn53SULVJe6dB1f zaqlB7rd1m{Wp;5LWfO}x;uwrsfL~T*>DlD)4Xp#|%!2G*VQD##5h=8xowmYmk|ocE zM8Q%iP_29Zj}w+>p?efqxKp{n0(J3ZqPbEvYiH2O^)ENnPf)jzp8J7o*7&V(lou== zNNk}HYFy9aYFoh6KA^<+YZjpSb5^9_KVFEn&O4*OChpWz+|TJX!yq~!3mmeTrX0+qQaM*|c)srrE7!LHc}&DG;VmZopt>>9 zL#>Bk@u@8Z!Vuir)SYQ>In;9;egwYR1@FgcAezC7|+XY4uD&)f72U1?Y6e}04!N||+Mtn1ni_gOmDd4K_~o%QMi z`F>JzK0vPvihj_Da@aHlAN3=V8FJR$4a5+{(F8%0H>?rSpHdk&aSu0IEdKm1JhI1f z0Z&4CpU&5?#V%yC5ow7yn)~^p zvB7`J#rkgEK>Mi@bYnhtxu$CTtSByRxdDBR56gb#Fq?>(bC52AL8CSMske^|qB2-X z;j{7LQTKc;C>TlgNlC%y9P?Q+Z!2aGofV$EecrA46z$E$+ZY?=-3Uv=5di9lTeB;N ziKtEN)OD1`(-%9;pjmdQNOSHSAUwB8wGV5Wl+;Rt#4Giagt&+seqHYS^1!GQt+xg!Dr!TrOaoLxL^Or8I5!Mz&V zvCABozEyQ^gje2V7?2>J6p{^zRILi6n3HekNZ>VGPX-xhs>@#QUTnzY4q19kEm9#a ze%|}IXDmH68;qmGqoMJLbw$VGbSuXzw#Q5@4D{j>7b$%<{|qH(R7Z^F#s7A^R<6V? zhQ}660YTt_mKUwGH+x;`A`Gpw_9?;P^H2y)bzr5Aw<;ynI8|GTf!@IraaCs$aU=un zw6Xv+O^2;CJDO-Xny#YQ+_(W@kwg*xf{sM1kw86sD2MBj0f}%?$|<4QO&=$iY(cR| zr5(@$il?0NRl-?sh6e{P0q~Wngy8PO7W0y|C*kyD;%>Orb?yTVhK{NnRfA670s1E^ zG?@;^%Pev)XqwgX^odgcO!83!5`lG}6lk*iyiS)KPUdb$!IKE?(E}-^Hktu2Q*LgY zBVglGGhIQ!JFm$Z9-55o2~N15Xh6~`mFkRA$QwR4@#^N*`A@!DnwD$4wVhz;o={gs zbu*V(!^}#FG-5l)i(QNDAI@{5PJKOd*3iNCnIXMIN#t{OSWX9Yq2xvg`89tqAIFsI zNw>muAKbtt8?emK%)67R01#w0cb))db22AC83BMBctc#Q(98#5EOjBXIPaBDaKIbz zW9Cu2&x)RW#zSW29gZ8{+Na38-Qp;W*d=L$~7HRcF)tyz5x~03o)}N-z(% z&rGr0P*&w0QP@0R#2~#%b)&m|6+hc==mK4G&Hhkjy5RlFS(Ig)NBxDj^{anow5urR zSLeNZ{s}+Q{+J_&?9=pBcSSdQqtR81M<-MLV4;JzE=X2U$YHr60zT@nC&1o8i%AsT zXp5{FZbBKYX~dQ9(&5Us0FKs>3v$8?Z3y&JRh!*YJ502Me_G1R6$tlJkB--EqX1hO zg6hg@Qj%YGnpb-fLd2*<%y${yGjd}W3x6`KHFSYe&7UEBP^JV%?_W%(78vAX2{=8e zbb&$mWmV)0y!ttEYp#>$a^GSYyK@l4bBCi#vjf&?Q$|d-!(P{7$Eh#j>_x7NL!tn% zMQA~(`8L6}gZw!Cof2U&1@<=(buFvpTZ3ORf1AQ_+cHUaz55HH z_g>;(f!Nu?)YQfKpV{1>4aWbM%LT`FMRYTuir#^|3VV1b&WFGXE3*?ds{Q~mdRzuy zNsYTheetfvG40&jbC}p;%Xs1`!8U`Br z*Pad{Cnps?5T}6#Qxl>{@`Bmp*Sk~`162~ox4$GA?#VFp5Q#lmUTS4~r4La@FIRs~ zv+JuK#Ko(m`FaxLi_c@1*D)?}H4!Yy8E3%vRf2O+^z+9QKy zx|^rlD4|9@V{KMO;p-i)u<$f#VkyzP1Lam{ZdtMvWi!ZPag|G?C zAt>ne)Wgt812{E9h}-XP&Ru+L4hi03W0GJa6X-DolC1zXbe3A@+Dovk}IPp?Cr=TLrx94fDUSbd?)V50My zZhEWgCLuO6Bo_VaN`{7fe z4%c^L*fPJBNze<;jSLHRog(9gt#GfJ`TO_X7BgwFl(sF}~6n||iD!TjP=XLFJIoE~`K59IO{K!}I$l9gIxaPd>nglI%koz5YrQV^J~U4RcSR`guWi7kDV&`(zJpXJO;5o8%%=HYk9y)U2XyRPFT2kwL8EI z?w0}WdRBLV%?0{FL{bSotS*%0sx2dPl>{ha?wJ>6n6O95GM+JkHX5d>=|LgEQ4 zPCiKYF2{WvKp~bh`?jYKT+t5fd3myQ9VVa6|9tw8Cq9AynlM(Os_Dc{k(TaM<@fw} ze%^8M`uH~(KKzy0|ocA+E;?MJ1@X0hgur)USA8+`i{%fB7?nY`80-6;Omv%j54t1s zFP|nz9Y=jwBWO4AjO%(DXMok`bV9?zu1$`f={So=G7pr^cMtQx70IN-N&umbcgn>@7S=(h10kGE zvA?zR#ud{oJKG}0hFbZc7r{fjL>(|Q8oGc@@oqT7*y%T5f`jdTUwDocNW!kip7n`IHXhgU1P z;ma?9d*7mye=`4=fO#pKUL`nNF&m;hE?KZ}crXd7N33u!*v^sVq#FQF-1L-DxBe;4 z)vd&QYkUFh`}Q51H|*`++RT}BTvt&Fu#vy@1}suN*&vG9r<_iQDcVmOTN9$QO{IPN za!uiFme&jdh9+~|jFRgb&zTh7-Tp-Z zV9AHL%(K*4f2c;kbyQigo$g(0?YE|IR+;M9>*MFA42@tE+g#Dbo;~zpYxAK99k%Nz zv{Jh)h<9R-c>o19;}&=)l%a3Z1XgJV4RyNjaXQVm1;xkn(fOHPt8#(V`$`{bTiX3pPe%?9Sex`mI>raf<3tcJGDqHwQyNmWSOa3f_(*&7>?Kj zY5d!aO;fgMd4{J1?R64$N(&)--ugXajT(tlAhv}O6PRr4PKth6s6#fBP;CnT#~4PR zX!AI3p*B}St`L~i_}#-$e+;uUM?!B2k&+(P!+aY=l<+2T$Fji*`PYsCs4^EfjX`(aa?E9kY~kaAViYa`yywUWnKRZK#8&M0|n~Xr(~K((P713EyMYgr6eaCbtKSL&Fp?V(6vFA3zP;JXm@B^L{zmx3#sP ziNVM*{*e0NE1Qn~QYv)}wYPn--M()u(BW#8D)^XHtui%B*{LAmfh!=y?E5_Bjn$f$ z{p4|I^#10@b6iV@$x62%tq#TF@gwUVwGjVwzUI@QwU8g5Ni2D;%l%* z^_BXUKZ;C$-8BQdGzTZXAI@P>{ja&-pD{RJYsD!~4AEC!_4VVnFX_j)IMtNz^GYdUH6N70SCK@H#wP{`9E+lGrLyJJan3 zPP`%|=hz@Ej{x&D!EeVmYEwp$H$%r|Qe6$ za_z%Gvpukji57H{nYlU6HxjD|<-d>6vzE0NK8@4Cq4d-WUckU#$BQ8j+vzPRf-|I?9EYGAjGs@j(m zj9YT^c2`|*57!~wgDMtJ%A}17rsfW5cu5^HhEK-Y8{bYmFvMkAU>uGfC-l>V@kEHP zx1MO8x(DrYS0F=aIiRrFB1UdFHOU|Z^yLX)U$;;`>Kh{Wc%dJi&qdiaF1)2^-zmn^ z)b?x~R{8{6c8RzR@p*A2b(qr&Ch+m3OhT`&M(?*&mF0ixsxqVkNsUD@had>ao@X=# zhW5j1yE=~C1kKR6lT=*Kj~Z^+F8VqeGZ$b}>S``U?dQKBMYQKL_4tE$RE`9Zk#@$G>Fop^-#{a#wp;VCi%u{WKns>evF1XxX#Ai zQLdxQ!Iv&F2Hi3)qZLrhbBwvbK`F1ALYqn%o)Lwu@t7C+0Ore|xedr5-`GBjRT4G| zoI-S7!}A+hoN=>O#g6*X5**zW4kAkr2j1u^f&%t2K4Md-D7G4Nh$x`Zpo{2qf(jNcNYJs?pJO=AkimLtP%P)(p%@1#j> z?WLW%8BdhV=@73T8l3SeFCIQC0$C4orrVnC$%NOWTv>V4I|cHx+yX=qZ6hl4}PMSKtn39No{%*B)pQ?T-QqR>mJ;~x+Gk4azLiv?fJ z(}IV6?V?cxQ*)!mkp_p1Aq7^v^XRy=phfuR3@OX%_<_x!hzd8E{dkEG&DcILVsap;(7vLbi0tcqx&mx-f=i3yo^~(ES6{gSX z>VQf|rOWF$84%Jk7R;N1U2(&g9t;7qmbh_zygEzBniA|R$rX^1A|%n8Wz-RI=;^l* zA)WfVCMrqJFd8c&jqR&}f|(~uBa1dWkdaC<$@Ym|E_HE;XvlkQ;;2 z3Ct)ig_pQkGwQbN5&K19b@oL3TSKSCt+kNghQDLiM$O?Z#p%*h8Cj?#dafzkB|kKw zrU-VBNqj>Ka0dRcF^PN~QZVaBMf|QxQa{unY{{G>W;-K+1&Gu~+{YV=ecbwVe3ZAs z5LP-?OkgI0A%v_wPmNm43W>;5#!DwH+t{Jn@XpbLvv8OT_?BCWdffVXyhI3OtsCSP zCCD0U^Fy3(mJY5_ZUrG1n6k#iP%P&nv2@zujEl*t=I4V0bv6YL8qo!mOA{8+QddwR zh%IV}w|<#ztWdbCSDFnLE!0a*BJ`(FK={a#ctNKcOE>T<;Z$Y5^fM|V6V%dSOmQFt z;g)Nopt2*3g74$XN>EEO_Q|J(s@o$_UzCDnEsu;Gy*2o8QJLy#WqL@+t|1lLAZ`i* z%3S~)c%`i&9Cf}W;#|AAeMHkbzpw%V z7EWu+%U=V=qN2T0ljO~hak3Jh>0bKz0%|L0!L3#O%8ru!|NF3W^D&Li8r1e+Bmf{H8%qLG1=r zJQRe!3g*t=NIua%i?$RntrIaL$N~!iDnbWCH$2l7cC7BgMX=1ESSfXde2=YuD0ji$WPM}lu@3XYA8Zo0#)r20UQf^3654g%gE6zW@B1&JBvyfUPgF|bMs~oH}RgzH@azI zoe5i!y=3$zRFyU#KgHbqC9=xjdj)ZX;2e&+afK51JUS4zCJ%)`-VuD^TLS4ogIo83 zj8*{%nj6=*^hUb1GlmMFa7T~qCHfiQ+N=WX zNv&EdjlF1C<1Fl7l;>AQ-(JN;4h9i;VP9R-ufb7F_53|P0#zZAyerW!`9+Y#w+5dz z3-&w;x$ElZIz{d-Sou<@wCDtnLx9|3dOlk$zuD5tsUP`EA2UrEjtk0Y$aH1*!b(39 zxMAbcBL}ovKw_5gECy}HqOSS!+h%LLB;fUUO9cwX^^J@1EA)MdA=T9&2ol;)xG5*f>`FRLl?jF{(Fmt+z!MZ} z#EfRRKT#DkG;zlRS}~q9ZwHH7OcdE78G?!0rCG~48IU#{s!C_NXye};MP3thh2Rd+ zEfeM>^#P~U!m!=t~ypLrD3&;Ft>*_(M$jo#2?uq+pgV<%CI- z(ETF9tZm114zrHvuHtirS#ll5AP?igvV{@%U0jQ$QXGtc?(q@mA7^nqAE4R~)d9;w z5f!z)J9xKSx{%WXSVkSml-iQ2bqot)Kyn@hWIRL!t4=YMc-7vre7rf0YUX@&AP#OrA>%nNH^Fx4}?k9EQX)he2x{vSMppxHpeHvPH z)tw|2hE3(~KF=sh%`K%@>7WKL9^@dF0jZCsYxqzKxQV`#h(@LAVXoLPmW^I(Og;y_ zXxS-vnchASPP>HK{1PeN&t$$w+vHENQ2eQfZ2I5YjC4kpb|*<=QacK$18V(f4vM#%dHnRE65+*P;7r$z+4STp zXj?6Nh$?#M9dlaxbb?viyfV6t9Qo*EcD*umC|x8=7MUnn!TzjD`7;I*c4LmC!)J(O z!Rn-VmGikDO0Bv}T`zasnT{xoXEtulVD)3k4Pf@pg>fibVtv4!1T%dNu4~@3!h0c_Wt7%5PT`Az2LH zx*P5+Ui@qKV5`kP&tIHj)UKV77p^I|3}=}YMIRdigz%S{t4Ga&TTAfIDK`!4 z11$FJ6>pRHA=dd#vJZD4^`sW4nQbetxn+H6+|FFY+8mx1T6pIg2THZW$siul!S#*oqY*Co z2cbKKlJ5^!8~hVdeKR#Rnh=D5bLd-!Ax&_(r{PB1F15Xt7xffTl}e>~Zi+=HBJTwE z>|UaK^bdefSdxtz)wo97J3L3YEL3t7rxeOhlN2QEHceI5d0TzQ!XWSuEED*gI7m8R zbH3h4CPCZfJ#T0gt#Uq&x?i+DqIviv^T# z?&K33ksb&$ct`<^)oJR#F%-d=R@i`cRRLeZQ?WYd%6w9C;W!q^L(pX5FUe9)u~i8U z)~uET%fmsoqr%QyZ?Nqp$nF7$?T>zWfk6t~RADvoa`rK>$)I;y#)xy&{tOBZkXm(0 z0jKT7|0x9ajNvxHAb`Me9NeCJZQKiUqC;roo74Aj5!lY$d$n-tyiqrtwL6^VryLJJ zyqo8!te*q+5|LrjTEXMzi^84QR7m7B?pD2IX3IC4ppBdpVc_5%LN+6)W zK`1C&ATzY IF#OT_AKX`#OaK4? literal 0 HcmV?d00001 diff --git a/src/obds_fhir_to_opal/obds_fhir_to_opal.py b/src/obds_fhir_to_opal/obds_fhir_to_opal.py index ef29c91a..12ecb59b 100644 --- a/src/obds_fhir_to_opal/obds_fhir_to_opal.py +++ b/src/obds_fhir_to_opal/obds_fhir_to_opal.py @@ -241,7 +241,10 @@ def encode_patients(ptl: PathlingContext, df_bundles: pyspark.sql.dataframe.Data return_yearUDF = udf(lambda x: return_year(x), StringType()) patients = df_patients.selectExpr( - "EXPLODE_OUTER(identifier.value) as pat_id", "gender", "birthDate", + "id as pat_id", + # todo: remove this later or change opal datadictionary + "EXPLODE_OUTER(identifier.value) as patID", + "gender", "birthDate", "deceasedBoolean", "deceasedDateTime" ) @@ -255,6 +258,7 @@ def encode_patients(ptl: PathlingContext, df_bundles: pyspark.sql.dataframe.Data patients = patients.select( patients.pat_id, + patients.patID, patients.gender, patients.gender_mapped, patients.birthDate, @@ -632,6 +636,7 @@ def encode_observations(ptl: PathlingContext, df_bundles): def group_df(joined_dataframe): joined_dataframe_grouped = joined_dataframe.groupBy("cond_id").agg( first("pat_id").alias("pat_id"), + first("patID").alias("patID"), first("gender_mapped").alias("gender_mapped"), first("conditiondate").alias("conditiondate"), first("condcodingcode").alias("condcodingcode"), @@ -660,6 +665,7 @@ def group_df(joined_dataframe): joined_dataframe_grouped_repartitioned = ( joined_dataframe_grouped_repartitioned.select( "pat_id", + "patID", "cond_id", "gender_mapped", "conditiondate",