From 02e06000e2d87cc7ae1551928673411c56d31928 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Fri, 5 Jul 2024 03:54:40 +0000 Subject: [PATCH 1/4] Update --- benchmark/matrix_add/Makefile | 19 +++ benchmark/matrix_add/benchmark.cu | 13 ++ benchmark/matrix_add/benchmark_plot.png | Bin 0 -> 40488 bytes benchmark/matrix_add/plot.py | 50 +++++++ benchmark/matrix_add/template.cuh | 10 ++ benchmark/matrix_add/versions/1.cu | 44 +++++++ benchmark/matrix_add/versions/cpu.cu | 37 ++++++ src/main.cu | 168 ++++++++++++------------ src/matrix.cu | 39 ++++-- src/matrix.cuh | 12 +- 10 files changed, 288 insertions(+), 104 deletions(-) create mode 100644 benchmark/matrix_add/Makefile create mode 100644 benchmark/matrix_add/benchmark.cu create mode 100644 benchmark/matrix_add/benchmark_plot.png create mode 100644 benchmark/matrix_add/plot.py create mode 100644 benchmark/matrix_add/template.cuh create mode 100644 benchmark/matrix_add/versions/1.cu create mode 100644 benchmark/matrix_add/versions/cpu.cu diff --git a/benchmark/matrix_add/Makefile b/benchmark/matrix_add/Makefile new file mode 100644 index 0000000..c754193 --- /dev/null +++ b/benchmark/matrix_add/Makefile @@ -0,0 +1,19 @@ +compile = nvcc -O3 -arch=sm_75 --use_fast_math +SRC_DIR := versions +BIN_DIR := bin +SRC_FILES := $(wildcard $(SRC_DIR)/*.cu) +EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES)) + +all: clean $(EXECUTABLES) + +clean: + rm -f -r bin + mkdir bin + +$(BIN_DIR)/%: $(SRC_DIR)/%.cu + $(compile) $< benchmark.cu -o $@.exe + +plot: all + python3 ./plot.py + + diff --git a/benchmark/matrix_add/benchmark.cu b/benchmark/matrix_add/benchmark.cu new file mode 100644 index 0000000..cc2ce0d --- /dev/null +++ b/benchmark/matrix_add/benchmark.cu @@ -0,0 +1,13 @@ +#include "template.cuh" +#include +#include + +int main(int argc, char* argv[]) { + long n; + if (argc > 1) { + n = atol(argv[1]); + } else { + n = 100000; + } + printf("%f", time(n)); +} \ No newline at end of file diff --git a/benchmark/matrix_add/benchmark_plot.png b/benchmark/matrix_add/benchmark_plot.png new file mode 100644 index 0000000000000000000000000000000000000000..72e786c34257599c2c3cfbdb60e861681085c569 GIT binary patch literal 40488 zcmc$`WmHyc)HVvzZGZ?8q97m=igc++mm(#N2uPQ7DXFxefKmn_-Q6ux(%k~mAl=_w z5A1im=NsqG`E|zF&)9(LS#jSpu6fOCu6K{+C5Z4T@zK!Gh@>PRDWaiaW}~5LKg*X#{!Y>h1~INA8v?%cli^3^LV zTYh$S^Zz-3&CwBY;&a3W_@jY`WHNPOUFSS4`V^C@gVEK$Lj^3IFF%m6sI2WyhuBWNWBfN{nq~d3nX#!<&^+=-ajFxi$N-H$(PP*f~`LbQ)#Pu_msfyls zxzh%&-{SU(h(s_|vuRdEV+(s#tJ7Sps1gfB^2A$;xQ^k)!255k#`oFYs9(l8)KxK3}x$K$bG(-5^kEIUV#5a z@o%2c(ZLQ_O-DwezlrwkbJ3G^e%<-TPaA`-sv5lBT1d;v)vLpX$1l<7XJ%%myZA0N zCZ@bj|I)>al9G}G8H-Y339-``A4-KSyfbY3{5~Q={Q2{S6A%AK*GD`~jy6|DqkDRk zh6>GF7$P-mOYIhKQS~Rvkjo42y-1Re{q+Ec`jLsrt1M|G^j8^Mi~Y83T>Zb{ z7C|&Wl7iDnI|;_Mj2%7_1(CmQMc53z0hRdc~fPo5O{txvSy=F0($*s@VRf5HFmrmh(Ki3EdUW6=PM4_rlOH z?EU*1oY!6!|+s?J4X?h%OfytJ>n$I1zUHN%If~p2SPmB)_H#)McR@z!F zoNOkGtpB_b^1H&t0iN*+uhlhLx6LPs{T;Cakr!@=|Atk{&Dvuv8d2PKLQaSFYI&%@ z?Ai2emYuRtcICl}K)c|Eta|>-2hPs?V7$0lRmaXg`rJFRE-NK-_PZOo1@O@F0?YS~ z55`Y&ZDxK+A2bDBmFP~AqY!rGJ%6~Q?Y7nXYN6-dhYzINL#FI2l?N8r-%Ebf$varB zs#-GTHfpCrJKpcF>MC=vgcUU&E9b+ZalAI&7{uaoROqaGG=6eKTj8>ElYzm9n3~_J zflO6N-M?hdJ&Mza81cOM`T3=sCK|9Sp^w5v)V*|4iuzN85O}7P?CrVJRI_eU2EmO< zBNqvaj8rvlY)9;4Iz*%$i(Io>SyijS6N|v{?^nOwrMwQulcOCeY3W-6u*g+2ft>9V z-vZd4_xyNrxTte71q*9b{Tdxi?dt3DazS50idt12w+0__ZON14qh}S)T+>Y=c1{0mI5ku~sKT9dXz zLtvZDp>(1@TU-D5UbVQ!%>2%e)*Z3&Z?|0qmh-z`_N7zwYud^U=2iy=22#6+Y;0_> z_;^B?B&#&)<+2#zwXwA=`4c9C>+=v<{22QoW7nNg2ZOl-SSE_QFL0%#q%>XE8(z(K z`Rr}XZ6JXP0?(Z)T{>;tU_A0A(<{HSv zIZomElF+sju$NbdcpOgSH>vK-8JOOX2>$bqF>az{E-o!w&=%~qd~Y`T@DHO19V=_l z;i0Q9G4(GJk;7_am*ZSknvqxwM?5Puqu3eu@bK9X-X72=y?ohby@5PDEbJk8-fok~ zaSdV#6h<*M>>hhE_yh!d@DzJ%btHUqF}8)zrk9Jx+y8cc=bMevFJ8)RiQ=)O(Qgc# z{2n5l`id6WOuaf^Yk0If&+Bou9Vf9YAR+8;FSp<^#-L@l*hz-<=!!8&DQY@_v3|Rs zSoGn;hreKh+LHs!Pm>&I*bRM>vY>0)jo)IVI5(%nc24}wOJ1sA_q7@vgK5jkJwwvB$U+$C@9ph~dEZVsG5a0t%z_nTP2tr**l9WKfsaX8f&(|N6I-3N)L z4l5&k{BVZc<+Q9ye*8ztvY!;WBch|Pf;pJx%S5tuc6D{VyjN-~!mbRt2b8EW1t_s>c{F~ zAK>r*4elraQsTUTvL9fFcQUvPQvOdp4u;1a50;CZ;6ou`e;Wy8A_s#Vj^Oscf}M`{ z=9+)~ni|j#G1yw@HL%3MCcJjzhL^**TTv~a%ho~zIPR}PjUtOS;e(|*__Hea_EZ3S zSC&qdbe<8~!>^BhmI>I3zdY>t(l$5t&lQb=f+EIkd(dDQ$wqCTx%9`~c4j8LNZ789 zAFfAaDagpM926H9M{$|oD&&0^!PTIXN2TD$ThVBtL35_ zvu$aD7I|G=iet`;tnxxzN{~rZxW_8pTp(ysLB5rimv6W4$PpsfakU=Yaoo_TazC=4 z*EEW?Q}O9AT3l^NRZfc%bh=Bf?MMf?#K@=L;>ax{6YtVZtKScB?l`VZ0CQB;3V9$3AqgLHBN66K9jwaBv2rJ$gUKlU;IH}l zADY7%Re4wEljF6G?VsftwyEX5kowrdUNjTyFs277MiY_~+2@fHct}Q0&b+R$Ul8i> zIhnzx`TlI zPYk7JVF{3rLjd{en&5Em&X`N2TL@3pGLzD91hpGG zx8IH4qcN2xaIC_;g28eRBa@N1Hn}05o!9EQFYulw$xcERWFxp;wKse9rhJUryf!a{ zJhWS*B+5tWBaYKSFJ8Prb|?qVYzxVF?f78TvZ$~vey&d^;Lo2Ih#kx?KJGor-f}Bu zLE>KMAO@gB=j&9&wtwDB{Hw+wS!?4N{|)*6`g>gSq5R2|mvt>i^?Q-jxICZ(C3#QgmHU(=@hE*e+ulN-@7uz%+nDInm~9Xu4tK9_>TSp?lU zF21vZm~2x=#G~XCOe5s1G?2MqWB?9g0Ja2{aiRxNxqNpj;1Gc#NK*YS5i+J1+xsLo zOH``|@P^y-*AUu;0AZcO4C+HKby#f&C#lo(qb?ksFf@9#Iu;B5)Bm%?q!vkLKc0Nw z-H1QY78Vut6G4Q4oE!!S%j_2Wpd#P|vyncSZI6}VrOLb)ke-Ct z(XT{1jgHxr%r=>84x1IDpJu#c1suj^`wcgaKi>m<;SRCoAA(WAIkv}KEU}5H)C#Ly zc2@q(%zSJs9#RIlI+>o=J_VoW-~>qG9)Zj};=0k6kd&mt@4hj|2iL)ukputw3E;@S zBrY~XG%UflmBBlVzV}i?Lqqvip9Vc6V<0L7hWlzb_C?Z zvZ<|}^BfCqOn+^nrdQp+m<3tTkrLZy1t!Xa1L@E5?42l{fn)FiklO37I&OfpuA?^| zp)-C{?lYRwqS0LaC50D_ffTaw!WCv_P`(KM`*{O_A=(wr%ZeGpxlV<>8AW5i05P$& zR(ejjL-ylGGD2d1rcPDmZM3mC2!u)l^78UYs#)5PzCZB~2uO2n1HTuA(u_oSSJx+N z22#H?AWX*CS6#$e8kbO}&WD&f?^feg%MWcYo|V`zz++59SeQ(Twzx$ghd>hqmO%Jv zhKuvTBTGt4Pay}uhRs3lo`5wONLkbdEMC4=OAzBU`$^Mr(ueR0$Lj*q{=4pnj>oXj zt2W{5T6R$NZLE5nM5m|Uc7~j=Io}P1hr`Rk+!?l5b}C{{`z19bGK&XZsqx0U#Jlk0 zd!f@I?eQAUnEQb_HVI8DVsFo%Uk9+nAtok8Z{OVYRLy7evR60K=YP{yht3)8U-_a_wH<+yTDL>v{Y>nZ&gd4GdK4R0tiliHQ z1_tY2%*h|HG&D-*lcF8>=acy+y)P9nq^7;>{D!3t285%{iv(c7@n}0hAuW|=M-3g8*c)WUY9P$PO zN9E;!wpN)UWM`sJsLFn!Ps5V+fR~HwW1Nu7UBlKW>56D%)j4&m(P-Rv^^o`x9u*}S z$)?4-Gh)jJ*R|g-g%m6M>8{VPMOhF)>rY5Xf<$1o*(uSTqu(@oTyEcoK_Ynn;lpzZ zLR+LhK0Z|T1A65f?E+LDhwEjo?Fk9=2yT?;oy2tC{7$WKLjlH z_4x62Q6L4UZ*A>EeutITw-;{w3Z{U9uUb*);BxM``@uS3_~owj?+aJ>Z0I3UO@HwsMyl4-Y<3}`SgG)v>j0$R18)+? z81EX^+A8b3`L_m8a|_MIY@tX^8$9IEp+=yE%=l_);C)(VssH%lEYGmpDBx>(@M$Li zX~najO%eM*tP5~IE!1N}^BuOTi=p%>U-8AJQ}U8e$)U0U>?36)a(a|!1cX9n3+=Z3B2(lyCw4{^eW1nmJo?g5# z7fLdI`=#RUEK@6V6zWg10W=qNc6N^SlLVvzb@1K0cVo=+eI)5?yAoxPItu_@@?;$98n?QCoJ$8V zp1YO`IR8!x&^p=H)8dLxpAuykc9q(hBI_z8^)AHcBkFhqN@dU7k~w8Ke;~^fWo~xr ze4Z%OzYHmU$Is@rby9czcW@oGPpMr@|M7~wwAgB=tfRjvwp!VbdlwD%54pzy6CQt7l;gb>^@svG30KdUBt5T`0cmMAzjm)#m1G zKT8?0muC0fGS*?~AA!Jgw7)eC`TI|Pw|v@1Q_0g6xnZQcobWC$(t2<}fyG-DttF43 z#B3>9Hl`w7CDR3nu{A%D!#A`iyG_0BZBVO715Y3&CzlA=J&;D|b6D6#H20md@y@TJ zIDP5Qp4uNuF~-(IZj_?Gh^`CPK}d1e8yFUTyM=nBKJ_KKYO;MP{B%9uh|}tdeSh+g zCqikFwJ1AkxY0vOIRJ|%WK`IlxWuPnSV*C!QSn>nkd_Uv!7ow>D1z>LEjo9BazVlv zjmM!iBp_lmi;|hk2(a&P-{wSvC{cCCeCQrheb-KMKxJnHQ8xw(7+0uD1Oa5q#`1q; z$turPSp*ssR$)l}yUs(k$C}bv8sq8jZN=Wq@cC?39A!|`2pnwdH%m(Xc)zG(0I_%E zRks}Qqxi3Zid=_?6L;03_ze;-y}dCTgQ;5pNp{HbjMgIwM&x8qVSB_@0d1vxLmoK6 zWM)xhJnA^-aOt0)DhAz?~NN`Xol>L!A;6G$)1d6koec3$Qu@w8f=!ClYn#8D&a z1-yWeIxU8p=pP~?6xGxu0V@M(^lGs$NaScMz3z)?nGG`qq~c(~eHTJ&Kxl(}8UZMJ*1z`O6o!e z1x6q=DoSm8A5g3}M8?#vMV^kXQ={fGYj<>95gaIe^Q1jj*R?)8=x)8qCb|j}>2;wJ zXB}TUm!c#$m0MtaX-i&Y=Y|Xt@4i^=qxTk7j=OR3-Q%#B`%Fq($Dg~ZJUnLGqMNj# z%5Kw-bK8EV?L5zbu;U0w04(7m9zvtFz0YSmYqqn6x^xP*n6%d?)#0T|YR`8(4?4mr zM{R1w$LrmU;OvNlT*FN(%y&8{eClk@Sw2J!ksV+__$N-IbJ@Bz=M39pngO1XR0HHk zvC8&`2=W5vs~Pgb-oF<1JxKSCv!9H1+brENs>2v$5PGE;@*0JpBS(nP_7x-r>_M%H zR=nPL^%J)_B~ov38nypHU=x@{!MjrvC@|>E_iw>wdH1lLq{ccZ5WP>HvM*UkzkOMu zEZtwObr=?9cI23FvAa?w^8mn&Z8bWM6@1XFmJ9lAkY5%viq2)54d&)@5KI8S4%mN> z#Ij=RGH@P$LPd!uff^zJunml&4LE}>ltCY{6rnvJ7^!7XF&F2;22Z~PWU(v|!oZY% zrW5u3pp_=-M_ll44;8U`{?4$ajafUfm=p!`_WX@Jzv8dwHe_p-W#0-$^RdJy-}v*d z2U;3EzgpAUf_bNKzjdZ2oyP`OU6^xd(Pix`!Oqr-qj*)5+9&!d`wLu{#;Ad%J|0%9-!pDt?w`;a)YX zcD`|Me|8zA7KX$B94|nSgvps%8Bb%zm{?a6S^XvLT7?)##0b43lhpybYrUbJNmDr$zM>uBp2a%LO`#brN*9#@E{gT)bjP^(dX*wZvR~^zpSs- zZ)e4mJ&w`QSr>Od23yq9vrp)`mR5z)*$w5V^|5+3?^L%Y)2UMiZV6HZA9Cd{Qx6YO zovh%8~JH3W2$Nq2N)X%J>c!b>JhMQth1d{Cd@*!iNKn+ZS4PLaN|8Lz*iE81>^uy zmVi)2%D~&2#i7vc0#3T_%L6Y-$W_pi0E13hDhLrd<^}R7&8iYy(C0Bz{-`ONc6k)i zNC3hJA(unkH*Wx4gIf*dnv{Zq(MYkiL6ZnvB_T2K4np$tXh;p`9`);Z2=!|@P{+G% zQ~1$1Ga)bl8f=3z>@wEYcM!h<0BG>*19SPw@c|GV9A*O?+!iB}9dSZ?-SNjrtEL^I zSO*fFHoIg>AUYw$51LV&(@hrD%s)XqV-pb4zIvP-Q@L$+rbA5>xp*2rV$ijmZ#dlfN%Ibq)KpVhf!fU3rQ9$U^-b zo?Mz;-^hq2jpGD5jq_hR@gPd9`#US65Ercm1AjmY4g>Ck`ZaV8IIO20(LiIzqIl9* zYhM-#?2x+LhE8n;wG$T7Z6d18c z^AqT;e3J;F@vhUWgx*DNHtfwaU@M>t(1M@E&cyU%%^BqyQxC1tT@Tr#_Q?mbZ(T9% z#Rx9Ob3VFK%l15g0V6%()$UdoH7!ofQ=E8y{idcMS31XAdStK?d?h6>ZYHXh)dMAD zLaXu7idXhk8bbi^7{hLI+S)Y0uTCH=RDKrlE(Ww6pSc2vu5zCz_Cvtz@^BHmi`&8W zvJ=oWw<_Eh-8E5L+aN1-!F8Wrc8~K98$sh|2O`h3N~4FobcI`Z9M$+q#|%6vV>zQC z_;K7(d4Z`1x;~yqqaj1vB#!jwq!Iaf z_ctW|0qq$EJyN7G1^6De0T__Pq4=~@NBfq4#W2{bdFSEwl199K15@$8kC#|U<;xDMy%?H1ffmP8ZQ!$+%EK^@|U8jj+QOU zVe49F>;c;>8!2FobcEooSTTeS4t~ar0LhpoPb7?S~)_oWK zHQa4isxk{w_TB-Ydil0mwaaQn69Ra_Vu9hXzDm75IF$RA!hoVxVa zBW_Ze>TdPWvkKq3w~TaoZVl_L#5ImtlLa<|NDR@01|Gu2E|slTA`mfMvn1E-M9X2c z0X||tXh-qe-GYu^GVl;+`7b*?q0qmpSBr`CeZV!_z|gfP523%(8qJesW~>K2ROsp$ zLLU$*B2XMWbQiyY8$hWX=lW0TrfgI-)Z>;HFI_V3O2E@^3Z8+|0SCkd*g$(CPP;ql zXIq0s{`kL@|iZJ5(FaOKqN)*Wh<<7*y>_|<<< zQ%y0)6)Um`R_d7f^C$G`AR?gTe+?ZSXm*#nko$kY7Rvdja$kJ-%bYTo zp|;rd3()0&E+M@p2GIn>cFF!stv#^40gsa-hXdCfrSIdkDW8EL`^2Gl4p@)tQc}n4 z0i-fxA&Rc+aY*g@$KtKH-ipV>sDR!~WR04-E$`C8wfs~I4Z7us6 zj;#$gl??@zvlz#L^j0vsiV)KIryRKT7!@|`+iG{bC{zQ+ND41?dm4E9X%We(H|F14 z{!gHyRKMUQ+#_DTCn}|UH z8x|gp=T~=ItMo%(hDLZyj4TsrYBwdsDCi(0fF)S*Y9{vw8-d6JjOqfuk1KsT%7#tS zDt&iWyxDcIzum{(zKSkt=^ZhuuN7@7?4A#u6VNUs6UUZDZ~B zn9oBx;3*^@J_PX#${`xU3&v|8DUo+Pc-5VnIws9JAxFolLLLp#Rg|)XrgLO z=lbu8YzYZ>I$@*w3YNqOVTA|bq&ITVf7sAOqFH=*Nh<2l=%*HCAlD+H#0aLF#C^{k z@6@JvQk01*uU1YyUhJzS6RUE1KnZB=PW}qS@lpivMihL_C2MJpAS;mT471 zfMS(_tb>>{=f0S3EgLVi?4@_q;C zb&#W$KfffMm3!@Qg3Drvqo2|?k)1a2MdljAjii~B$^T4VfRWP2a{LVx2#N9Fg`cec zl3D#U2>nbpyx%kN-x@Bw6W6Pu-yNsUej&~g(Su1`LP}h5u}h=AP?$^YY&VUYF3S;k zrinfETCP{RvdekJmFS)^+3--$#2ErlFxU8pSAKvKbyB4(Ku3YCi6)xqleGL?Bbt)v z3_L3h=01C1yPd^|Jv{k{cS4i*(D~=W6*$M=|L(+w1%Q@^ca}Cmp{U;9o$zbLCY@Y` z1u32XSxol+RmWAfsH5xWu^#^3A-v*zDSwEiuQ%mA=c!4h+|ejQyE%#xFM`o2$X z(e58>rqL^bb|RF)shozb zR}iiSp@{*=wt}YRK6J@cwd|DBTTnz1vvAx0su?DJjMpw?P1Z?#aTMhko1f5aR4)3O zlcGE%WlgRNC^(4r=&wY~t1c-tq+tS$oX^0t)ED19y}6#4^|cWvOa+pWhwX}vGr9kk ziE*mmSG}nS_I>9y|b8Al?14+70^`5(_|ZJnHRziw>i+?9lw?(cxzbTVdflLG=S9MYdg!cvGh+$!_r1g{ zwEX0okUXJhT3DERnsVx|m1AK%vFY>@CHjk#&LrI;hFG7$Pvxh&-Xss@5NrE>$4qTkMUgwRdg&8={`ySZ+&Yu_}-o1j4ctwJF^ z5kGNxxcs?=YJ=8mJ7_W6f+oQlq(K~1!jRB+!hC0TP<30zuP4J#(d{KTxB?`pc^4BuUv@z6V@{5#rFIyp0cnjC zbc7HnfPk`MC=>LxKBHpLoo?4U*Nbr{%&te`dlTAC)exuZ7UUG@y!{ofkM#7p4QQR0 z|B{U3U(#`3ddsLc6%?A6?|M5=^t|>KrFNvU3>VwVx%4EtXd58@Y~j;XdKzV) z-?)fps4iDR;=UWjah?u+`|1P7p9Q~5C1(0JDT##?JjJ@NSz^7&CyF5A-;gG{55|m& zmt5+J3~Mto53C9n(sXa%`Pg;TF;j(A!TayS2Ciimg*`8NDRD})dLsXevp|tfMqZn4 z&9UXKR>gEhrrGr6H7ix^JkIfdt7x4Z&Xeca1?^nFu*kuLck;Nzo zB6AON(qvMT&$3V|9+gcrC zjT-NTZGo%UIz$=U!NyLCqPnN+*pDLpcE1Lu_`dpZ$s=zXKI^5YBv3@Sppk1hEH4p z3$2(3)=DHqPN z5uR>CB*zweO*q!~_qt{s55TngJI~`cAcvggBQA>o5l?0h->h*a+TlgDD>kgJXh_RS z=|7%bhD*4U2>|PPP1cxPV-Cy;1tMq7f>rXmDSGp^)7|cId~}_|@*x~?bnglQcc$woD797`?Hmy%$#}{#mDWK zIVzQagy^3Nrv7}!#BUbe2{9C0pkF5a z@1Pt#BG+AqsC5of03M&HC$n{G(dQo8ID>KJ%beZPmQK53)b@FRRitw`u*Sr&#yhf- zvaq3H_5r86o^q_tDe~8nYtb0VvjxK>zM41(apw@+#h4;~h6xtpXY#<5Ct4z}QQ`q3 zh7GV{MM9K`ba_pEiKc$DLa4K=*0P>!qF>a^}!!SB7_1HB9>DrXNrr2)lA1;Nu5 zFQ2E}M802kQXYh)Z67^<7Ms9-W)%Jk?{QmQgrckqHuH3E$6y|{d)8&XQE`FYl(K6) z-3wA30C?L0HGyfp8=g7!_pVq86P8}g^(x){aoxIKy0c|K6B`EQ*`dCu0^j>U*s)&*DCi4cL%5iWJ1re9Y-g@z)_(?ET;W20)EvV+UUQ z1kX^DHJT#OC;ZF!pw&R{Z<8!F6~>gfvnO9J?RvgD<)ho!yWmu#_MDQCmlj(K?-{ke zdR&5U&FNm`Z~WiutV4*SgDn%7T$;8U&Np(V0pMC z&w}pT54GQnLyOMc9eOzT;JiH3coi5pFSeoJIiJBAxG&zNNUQ3S&<~fROBuNEPruYN z@V@~Wkhs}RXi>*t@4R5|2kKk6r}i%CFV!4INq)KyCatr9esk>@-^iM5BWi3#d>)^B zB;%E{eNY#=_+_s7JaNBWVi@I}vxC#GC)Tv4Wz?}S(FR1+#`)Du_CMJ9evLG>1)l-P zDHQx^G)1*bie`RBe_ae6%902xtl^7C{;5Z8CgaE8G<`}0FUVQo50ud_NiWO$r0l4! z?gkbXs0Y(8p9Np8%Sa4jlP$%mjMok#_x4P_U{9m8m?+h?ed>dXqU1rfJeG9{Ws`m( zUOC<`{(8{lFB6@V*l)biD1Z4hZ(Gmd0q<9~w2ukWr@Nf;6?on;Z`i=xPq2Zv2#e8( z%Wg$!VTF#}`hWHiv6JG)*f3tj%N;9wGjzQD%>ye8M%MU-h zt4bEuH|%kK%pRUxF%UjO8bMC12{lRJsC?pFL!hd4{T;RFj4y6duIo+gNb%CdQIGN z=)Qyzo&ToJ9S=tvty%6la?n}K^>?_+Qw>=yAIB>h^V~gi5+F7(#2+ZC%eI6k44pME z)>E0tg`G*0j>!J0;^;q%h3-oB30Nd$_doiS%(Go8XKwYS2&vwzdQelUoBqrQ@$v&a zR9cft)ju^`Qg-~l8XONlk|s>L%l89Ttah`${1+~m#ZQ-sxsN;j4aqS!t?;?4wq7|Z z2i24PqVe6=V$OrpFgKnZr1DP%`zQl$3dy17&UL>R^~Px*@KPbi$e-PjR?D-?f=||_ zo4zDmyJA{IOIUGXo3C?1I;`-l7_(z=!IQ0^6?VHQ9pITGgpFC#JJ$4oi4-1-S^G5T zD4J^&;U6i| zne-{2&$c}&W1lkV5U7l9WmP=_7MlyjP6cxFvghDgE&7 z6&aIr80`3-X@T6xwNB5H^4|YL?NX%Y>{8ksmVUF`-*1}G zqJwzO^@jpF+x50>5#a!zs5W%xgkX)20cHe8JOsp09jumC~u$v z*9rBb@bT`nS|q||Fw>Z+5v`68@9g+A??6*_G0tlLyH!J=TwW&7dnt79%0znmGc=k& z#o7phlxC2EJph3g$j~eik#)KZLN1WXl(6Uyc|Gvjo;uV`H;;#?U0Fv*6?r1U4)+O+ z!YBv{k@24<8kcmlbkqLKJLyQNsVF;hn<$QHw^{$OBEYwCTHi4=A$g8s5LjGWIZgmU zs30;0g!I=Ch45<(;t3$aCDL1vbfnbZ$L`yokbf`#!k&znc?Ei!A(&rH2{kMqP3>3=HM^m2P{r2s;zrX*lLR%XfG$uB-Bv8M% zFJxx^-#kS9W}If#TkoA!#?3uW@9Q;NH)4tD?(V8SDkJuJj4Lg@P|3#ZdnlB$Z{gr! zFA8&z>Y@cbZ{aG$f@}Ao0#)I*hjEs55UAgA+uxLK0ukLZ2!BeucA*s_si2V6tB!0& zZ_G`fmie41F8)lnc#aEbDh5j}<@U+BH@_R5*Mxz$M3`$qg!Q?3dCje@BlG=D@QmBb z!`(37gs+c?0Ab?e`Nt+0W^B`hhAxU{xhO)y>JH1Kc*ZcaQJkW?ml+roX$(|Jjb;+skxkL9t>25v)b4wl1dvcR(z# z!~rT2RlOyI;ekmsWt#Z^O-59vOjih}A`$=o}ddpi*4t3a)e$TXl;i)cM(9jXqfffJtrQIXM~pYLgR@DCC z$yVZ2#uXc1!>n+}1eaOfug||HrRW+^hDI>)CtP6`KEZYrh@qw4P{a>NdN79n^Cdwn z4!{)GlO5XHgfhKj53l9NSN3Cw!7t678Qx$hvgm8H$1p&J86w)hxtbHFuU+#?Ox~Irt z^_#)!ke05-LtHX!A)?9&c9LE>qAajx{P4ZivBP*|%CG>Z#E7DpjL41k z`S{<09u%EDhTiS_Jg>W)%D~4`=q1d{QWX=*VM8mI7E@uQC+2kD z<=Tc;-|QUY-cK0rlx^n?pH_brV67KMkEkz@kmsKhw6|7^WiUzRrzt=GOyNmbD9vyw z$mi-QkS$QukH{m92-;y_O|XREjQt1y95YTm4lKJRZF=GNDT^Wg>h1g5#=ul&eB^ z#f~+^d8CR#u1$?t_|#pv3Vq?MGkX0*->R0jeGS-}SRsQUElgt;EClRJE8;zfD! zAC3+4xJpg@R?XWxhp0Z z6+}1Kax$eMX`aG3u1D{2%Uy6}_nu~tPGj;O(S~;2D+GVeCoo~CzIE%?bq0n+Zh2PN zL1PL3e-kq@x+@-@%K(NY#trlA!R^R0--VZhFbo;I23_I%4<8;uoAuMDPlVL`ACNKL zM48CGdIQ7;mVfRytYzsvaQqlbm>j$={7 zHB}fW2^z>iGu_{u@8!1u9o$4_*~(BXN+OghO@Ho$nrL;L)@;@GV$3#f!i)NR@GKL> zpv~%3X=7t%kjK4%N#_DPJb3ODYnER=MiLQzssKpx8Y#8Gs$(b#vNJ39O%T<{HQX$C zMnsv4blE|bpe?5*4pwuqdVh7Yb6Rf1Z4hBsrsa%VAR2s-#9~Yo z^lR(GupiR;H_a^Z)vzc@Ok%qM*PTBgnRtH#dcMCCh+%xm~Reqp~1b zz|gJr#zSTt3>26SVSvqK;V$9|)7-XHg4XzdvV85`Cg)DPPE^C+EcI?va2%oIw0kxN!>P;z_`^ZQ> zZ!zylz@}UO_CmhpcpS)_V59@18ekG4^)WrWIuKdx$Apz}SIlVLg{am|542#xf?&Zc zs*dboKk7gs&9~9Y3G=P!*I*tk+M+b&^ohCH7BAO!i@f=0>uI>0d`!D3bOZIs$?(YM zc&7t@RY^(!`UsCiBzTk|V%-yiwsamuFYh(j<40Pfr4S2#QiuEVth*H5SR$e{~`(nWPSrt*?inKGBV->83Fl#K_%#YiAHf-_UBoN*4oTkf3Im1F;4R&$ufm z%n5ykIRKam%C#8fk6_o4PgBX@JvrR?B;>*k|B#VV3fLvVH<{zAb}??Ho4^2#^9{m^ zX2WHOtCmGBx}Ge53TBs3}!pi z&?jNH4IRYLAOXamfUB>*0*;PxUP$4x<3}xl8ZTMg5i@BzvZL#C%y=W+G#}8bIUnSaMCsF?REOP6jft zb6Rxt7Sx(K9-oUk8P`4dsWGv(NJr{#t_5FbK8<8-8J|cL;)&9#eCzM+u8)7+NkO@L zvC4D^4hY?Dz)<9Oz*b2LNO0)29GCO-pd#~Dl@qA;xkLmV^Hg83A!xjm{QVXH&@b6x zb<7o-6KIE13+ZiDIi;BBXYp`-_#((j7E5cXIwET2wqG2O}y4`2C5?^fEXp z4G3}!rV$BD2M9qP&sf#!V;`ZYE` z>HN+QecO8h{hLoB9`0vf7QyesY8$F@;TaHK;NkeA%554+Z_L=M_PW5N_ZH}Co*KKt zEEU-~wIH+r)0e|?_H zjfJqX5saF?HB*Kv#u zNP4FH0DF!|xuEK3fDvc0dz2a3?4?Cs6e{ZWz85|I%G!)u;CJ<)uYHtB9$jU$`h^i1b+J1a4*zMg#Ow{{zK2LK zQ@Y`~rPdieSPu`HqHEL60}SA`3q)3^ARSid2u2oPz_ea#oKQ5p1tS#Z!}U&}DP_8# zh+Jb-gNvjw7HjeHeH-kUV}*e^Z!l3M|Lokh8u1 zM)k7L=y!UB`ReK6PxNJQJ=Ozs5Ord%+2CDxSA-8w9TFsZL5VDTB67vS5H^nw_cQ=X z*&(-<@hs05NjEfW)y!Ah7%2MOJMaD4lnyI_@FnJpT49f95q zoYy|Uu!g6Xy9Izocz?_&aW-T{aTuQ<#qWf1Ug-nTuji4m-ceKtE;HOCuL-MJY6uO( zx@LucCgV6@jpX`5Tq{ zB;~K}kRn2IFT-W%Y9c;u%(!g-nhVD0(LmyOABM?r{Y=v{kTuUYg>84HJ=}4qvG4zC zH9Ngs1~E2nXc^5k56+zDe&h^uh&5+nXFDDS0J!X!VH=Ex{R#9A*(U<#JeS{y5IAcy z&^I84p5FqYz!}&NWKs{N65u5=7l*#l88VTg$|}X>7A^FSU~dLmi1-S{0x$dEI|~Jr z8ml0A>_?=V@T!!i{jJ4kMHX6v!|+mtGjP-k^*1W3foLe1odTd_&wV6(gO2kJ!moh7 z;xm6DV#q=uIB$u(Z!MfZ0;eC#bwNr7DK5mXs9~G|CD24Ab>k*@_E+qJJKZ4ob|*bt zqDCB$G79n3E-6N9kU#*OrGls~qo~+$^Aj{^vr=S`VD(rNElV3OPY1V?na4CA87 zQdTkK82O$7%boZW!N2TlW#dPgU6qfS9>FVk-*ZzQ`{*twUz z81DZ9k1`>eV!{bhP39*eEGoA8PJ;x}IJr4Wv^q**wrWQhv(7HrhC&o6@;z#=w_x~1 zb%2eH%@EKt{mq+Qpe)hC&t55AWV3)-WMote(J%BrPK3KoRpp+oON+$G%R(bAp$a~# zXd7aXGI?aORCH_<@1|mWn~~9PqeCPfnKTn{t3>E7` zz?5+m2SXfX(MnCV1y27!dA6!ZyHKy{OO|TPdgSer1-MEg}!{7;30$0BaLOz=C{dToXFSazPBR+pavJktChWay1 zEdjNs%Y^1*tx7SwJBn~$@5E)@n0>Ea9#@-SnBF{q^E$3Y$1r{->s+>A9&P-q;+U{` z+t++|ZWpfqL+Mu&7Saw64@aHdnBavWrr$zAE2}a<>{pj>G5Yy?Fuy;NB7o{!S)&Rd zXP<@l8~g#u5!<4hUHt`#Y=G~UdGB15IkS-&K6vFA!ryHv{D=VY(+0d`$QIr`5C$(t zTC_JqQAeomd9+HlW{oXc_5kX>{Lzmiyyw*r)F&;aw%{&S&C91{Cxs~?>(GkI5 z=t5ac3>EUK2z&c&wizx23V}ThkElL={5Zqb6^6t?wj2h6@yMW$5fOKt{=Gnk34r^A z!Mt7aD7ZAdx8PPR3ygqdS}r(dDVUfc!lcQg0bH0hVb?}eI`rsyJa+e?QwtJAY3V0l z+ukYl7lPFUl~!uy&}jUyy7`p0l2Sczy{g9ceTB#j1WXK&10Ks;T80IA0Sp}Si=rcv zoSknf;$O0SvACA%xq12bDx)33Uio{J31OuD`0*BbIUB-M0*4CY4A{s7I}9LutVfR@ z`of@KX36Yllh`zP&kUf7#wB<`%4I5rz`B;zS>54E}lDP_907>TF>ci&CB%4yr z-ja|#7U@LjelDTK;`BdIc3R!8Wi1f_cfw+oIy^r z?x*y!E3SZXQxMu^cW{$tE6_V)i;U+Y+D3SeaQ*A~}l(t)q7C36(#e-6_-2ld|tRRq#Gz1!&Wl}Y=X-=2) z>VExmn|$>5E~axF=0ObFGwLFbOzk=^eR4Sh%0i=jB10p-4Yq1XH^s& z%EqpA;*2upI`McMqf^PwaA{kZTaB`KCnxlCf$bbI+_m3QqT{G`vH#p=9w*1Ii}Q;R;mZ~)Iz3$gim_YxUTd~HyR4hk%VR0}M1HR_ z;niRFo+NjbCeiQ+)D5O!quQ{&Oy&^EJZ(a#WS$b+Jp7JJ_cMIIzxU5~t@nM_z1FkubziRQ z9FFNc&g1hrH0%un{M1-g^=k4^=f0Y2b#>x~@7+;&r5_KABhrSX2)PYzrRG0a%SeBo zM_Qd*+J|d~pImj!q(hyskN;ExFQ*lYJ=6I<$vHnbW?LXc98X2%qP<{$3d(TJOyMy+ z2NL3@X+gHic8vURw(aQgEnRD}`ga;!2YXh@`;-LgA3+Q%*OQ8~+zfSAk~X(wxa6P^ zgM;nwlbU~@e1IRgv@Q#(!NGE*`;U8jC2@7mn)snLDVkF|ykI?g7Ju6!#tDCt+mC~# z?%RM*7pA@J%=%NY^V6mvFI{Ga+uDfzJAbf%)`L+jeB^QK5^b?$o7t`FP$@I_17zaa zg@;Zrw2{eB#v?iFo(Rz#f2gS5&6(Ukp+M2H4tJ=d%_Pt92F=$nX7jUf%)#6|5Sacp zK|_WQ8#fMA$dTf3b^@2&RKs~zZYM`%dsUHXV3%Ls{2rj*K8Rnf0<^TQ%A2 z)5-dO2iF*dd=tY6@oHTfh~B_yR@+;fHFg&q==Jt~eKd=Ifb8u*`D}s;0_Q#^m-V`| z%Nh2Ghb6L>w#EZ=pRlkGmCKtoVQoplXt*?3w`iZuaf8V(9F$k6*ZyAsNwFM#>ea#+ z$wM`XRNXy)#$%i=dSZ+sS>HzKtrK8REi#ddlC&*Ab_2xvyv}1Ob8e}1wa@cp;)*-R09jDG8*QXlp^W7zWG@AZ~0@h5``XD7w zXh>P-6*%`Tq9~EVOcw@-tvk@sW$RriP!f+??wn;Zj5DkxK+Xr-aWI4pwS4BuDvCr| z(pMRa-z)*Ts{nt83OuNhqG3_E+GDfJ#a^{>*lmR~j5d5}?h5f%0{PMid3h_g;)KS< zXAH)G-bEK65$ZvKPGCF~?sR%}CUPvdMv8qf(%-D<--a^4q)z&-9JNak{cgQ#WH;20 z7l=x0+Z`0;rv5~xsfp}59O;EvWIRQm>6THGUC65cWX4V@7T~+7doRF2Sb4#5p!@@!z8{$}~qi!P+++hM>Jc0BtUrCXENaOnCY{FDQvNGchSx#c0o zBQ2)jXRDHXZbHB}{w+1r1@i46GVg3a^lBy|6(g!icDcMKjd@ z?d^LUyUL|-yQtKw$36YOc`*6 zD*{ju437mJ;9GEyM&vej%Fln$S+U-H7TVu6^*V{wHQ)$XEA5xa66zsKcFkTTdExi&s*&^eIL)L>m2QTWHORsq|7^ulbPAqI%RP9CE+^)! zuSTNmiNUeRVDrV3!Vkul_#O@OIuQ_29!4s!RBwb=UU=s6eX8?gq4vZ$cQk;QsdXHg zkN>cn#~`^C9CE3b=Km0YrrQ#2E=K|91=TlFab&Jc$6z&>8_sj{mm-!2C3*1V*85^& z+Pf4HWF7>Ti=+2y^2R?JWuTXOk$MsQMfmQ*LGola^$|^kCsk2`-=N^N(Llt|f-d-s zYVcP!zn6!{u>Oc_s3+za9ygRC1vSi`j0nyw@si0TEy|T(CQAnX6=Zyv15>vA9*Q)a zMaXh#mH1gy?mSQ6I+~1LPi4Il0m_nCU(Nl1-PTZ-6T2&1(gI_WLo5* z+R@f3%XgDXWr7QaC~N%tJoj7}R^KCJ2WXD{PqiafJhFc7kbVFlQ=LSRaXqd_R<}Y` zk7$LHxczIbJLv_Ex@Q*;0Xhn5Ox%^desLD$BJL(o2oYR-#vN|HIEJ?37$IiS$%v&n zyKDJU`JFRwyE*@PAL1F)hVp+ZMiGv?Eg$|Rz|Yb-B&5asVN4rJ>*r^YLXp{n5+aO3 zhw8@9O{16J|E*H_V}o5YM*q^OzXrBg@<6A&2Q`jy9_cocVx@OsWN9lN+7<(>)89oq zfadpf@bLR@?jPIIX^$u3Aeq=+W2E6bIZt4_V|(&I+0%|9Wk6}OFXVp&sDRU4d-b-q z!p*R?IvIwxe)rKR$KS1@3`cZtxX7t4qei2%KE}4SIkwne#+(?oossBY+s+^c3a>hT zireoWn1El0&-u7}%TZ8=GtqioXzf30=AH|cHk;m?pEBP=!LmrbwsnyAbZbH^REg4#m3F9;u~GP7%=wMc@n%T@JP=Z z^2}`JPwym6be+V-ud5!d3pkr0H4KW}E=)!_PYFywo zoRNFuLP&i~o@ql%j4a}((?*c`LL7ZM$Gs08q^x}r!?B~HlM7oSH^|?P9`1>`F_f`x zI^5n`RuuVBw{~!O^UBbo3!JD>s+2rIH=_%T_<8M*S@a5r`&5*kXJrg@)V_?Ab>S|a z;y#fgT5D~KHGopYa)bM8HhRLu7VS8)k7S=7J`xmiAf^`ZcpjK#Ecp-}B1Yonh*TKTHWL0q)@u!>?R2vl$0w;vbNu zE^+U^NHbyMI@wz)hpi`22hDFesug-)!nIIi&OKMh2tk%^`gES080=d047V>UkH1#z z;e6w#jr;5B@I}hePlK%=fb=JPZ`je>V$l~q$1T^Lo-egJoG{2r zcGHD2Z+Gr?YTfBSxFSZ|x!rdm`n5JsnM*uEr|E3i}a5)YhPu_s59YVrE>#TFY3v=9Pi|0BZoL?4ThJ6 z0-kKh=Nv~fuJPo(5IHAGq8ck>P-J0f(`}RUI;+0ro6fTfPvbxV(yXK!p}Fc@hxBWc5aEHOrN&r*dca%gflf zS#P#-au+`ex!d~sp^j-S$fiThjj&EUS8W$uXn; zUcBVm>t?+WNNTg=3)@@QL`m;wmuaPF%2@J{W%V!qEUf1ILT)OQq9ckLq>;0xS0Gu^ z6-H&;a=G%7Tx6T2m;47cBH^t~9u>JjGCj;){SO~YT428iY3M$quV!}-6C)!eFa8D}`xD3`EHri{X-Mi7=4r%2$v5G1IDE#0t8PckP*qxaaIy$L!9pos}gZS~i z?QJDe^zuGS!{Q%uCs@G1bLsCc3eo16n%b~h#~f8`pM6rQZ2Z{5Z*n5L3}aSf*09Jw ze9SM&+2v1$sk5%X>UHW8-qy9B-T5*L;-#9EQ67ThL5CvJnylN)Rt+~Uy%fs4;m|AC zS^dV3&Bpr;>Cs^XDmg#>BofwC@7H9Q{YS}|NY$lE2kfSuRe!73BLq~UTB6fX!2}WM%YQM0iR>99 z=zlgfku_~A-=zmuB3S>p`VfA(%s;=ePYkg^ImjTskpC!@xj(E^S~lT?3GvM9e}g~( zfz$C1P(ELNx~K>M<6`(7+(#+%O}P1-#Af@~xbPg`%Epjp>q+cdTs(4I0g+>YRGRw#uzk zmiYlW2Sk34f!g&F&uR4;3<)^^M7_r9>{1yPo8VBr&zai^}P#!dA@t_F(f#R2DY@lw~ zvpnjpY|IBlL5&>TJmnghaF6A7j8~%1Mb+{I+iL#0vQkOxvHNBJYyRWg zDy{E_9GeN>i}#rHOEX=#VVb$7<^ZaMssuXC3s?i~8Rm#=j$qc$GptErzIHD<_*T0=ZK_u%ZatBa*RfF@e-8aHJ$2jSdYxat)yM01&`T34zEiuIRu_Bh#nOH9d`hc*p7Srt{>@(>q{VUKo97g@@+A^Uvab>!!e<1Z zegB2aQ?dvflg`nd>v+1NR3TGwMZGZ6tpR(RMPZ;nnw={`_GKPO!cJRy&S!31;Ry}0 zN0H(6hnQkrRCvvt*3AWJf)?jjh-W)YBlUs6w5kpWVSr-ZB)s|C|1MfxIK|QUGH1tc zzGw$MWbS#GcwsP7^xS8tSPhBQASAMY1gI*#ylK2Eucb}>!1VfWk6XsSJ$9R{*Mlhn zQi>Ixi(1?tO1nN>e9>CfCs-&1=eetGDYr8>*Ph%TF>q|}i>Glg*{)90g6uw?455UF zQkJ*z@xT}~saz`q9Ek||4cw-O1Hp4I@S<8XYvPaR0E2P?5TWXCdH0q6{P_T3IejC& zrWF$6^izcmh1L~)<;%T(YL9a|@hr#9;sGpRvHGQ-Zan$#bsh3gIYs;a%Ty))oHso> z+Zz^k@Y?mq4`cHz6B2HgX(&w(;b6Bjdvi^HKhC7JVCV2H+pK8E*{jrSPEhP!G8SmH zZ`lv}Cd2Eet33T>xq5hUhl>}V^fyOe(t1ea8$au7`P74&52RKVA?dVvM)LN*3uynM zc^@x9%*w23XzTL(2GsQ{a{;}Lug+4#n|+FV@BRby#Wr0$^|H_*n`_tFCab18bC<*a zhJ#JCa#=sUgXM_tVy@8iC?cSyT~(+jL4Bpwu77^*NLKF5LIfGR&Wu@o+X}#Xrp!&6hGCF2aVKl5?;b^tH+L=Gt|%)!(bpc4=?R z2vK*3iu6VcMFCVnA)K}J+V^eqnGULQ0pkjXz*14fXa`lLoT}kHmf2#i&aYQwTv>$G zF=dZzPnxZ)+#NoZ{ zxf7N?cdW(iet3I^PJDA9KSQJ~zc_0Z_9}i-TRrx0wg&yf^x<(O+zIe5jghx{y}x51 zJ0=-NN340{y$|iMXO<7FD}(>H`k#eHvy~LpSkZgxh1G-V+_7emXo|(5k21w8ao}CS z1^;ZQ$3m&GokBj7vhVKgdVupeMsn}KEQ zDyX{2m;-oJzPp{3sh(;}>BXNpes^G#l# z_3YBCu{@XW4n!#Nh}=UYDH_`W`wAye)sdI-KPuO~zt3*ZAkg(y*%L<%j~w9sz!;eb zKVG{caBiDhSN4R*e`0jOV+DoB0$UST%Orx4*B=}BW&2bV9hKs#qWk&RqbrPk)urZ zN~v@#4+o4_L$^I+s4o$Rg%>Vx!ZET0A=lVH7P{ySSMVm}+;-nduG@u|kIziQ3x^yV z(@(;{v=a^QI@QVdPucE$h$4_Qw&_Nw84ntvf${TCcfF(Ou(gmHvJRsXK2+^-N*q`O zbcb^WNZz(`SKKZFxcj6o_FE1>I$g#30I>zfd^@H4qF>|5eoe9@(FhW?xw4b@OwZDU zYO1TUiR`St?TB+Wx?NHFL^M3@E952A?V;3YG8ghpTNVfCU_n5VUhgeMkUH$#{uka| zEWgF^a0=q7`?2P7rc<^#S6&ZQdAjbVJF(MvxAn2dq;tIsr>zoERLRiRIput-v1(_q zhbdwFQ_aF)o=K7Z`|BZ25Ot@Gzyc4P26LwA^uu7)?zL_07&WP<;mp+VVRTg8eA3Z5#h(g^czC5>npu1iR?L$~!bBw8?Cndl z7FghgDExb6316o`&Ocrcp;(-E`4OUi1VnS80wF_z^q*;wHv%2pC;~XU{2T(>37Nh5 z-$H-=>&d0+w>X1{vc@Y_*2L-}1$=_i;>NF%uu@(_jPl{ZwhY5G^66Y$)D-yn0TEWl zpAaJ>%&_Fw{irQh!fhneHNSbva)`KrwmKQGA6_v4Anb%8u?b4)ds1RU5)=|^94sw9 z8Y0KbaLhZy@Vu<^U$ORrY`*E7sdKoQob?%b>vGShkrN_HgTkkIs{iCh&8o}$xOna_`K95^_Sg4RwZS39Z17ba0-?>TOLH}9oHybVKSEVO2Yf9t#jg=P z#40PrRlkOlgpS3`;GcOpL}7qF({SOp34GS{y4so6KcZJJ=9QZ|pE;7HAe4+F%PKA7 z%acKPHk;QlnoDLqD2x=crdsTW4XY_a7yZHwj~?K+6af!@ek6em4j-a$3;>#4HPf-Q zc9XIysn|QjPhRoJ^4qq0;W#5E(m6P+P6}HW@Y&AE+;`091JYb12(R2{<>rXwEoA2O$XvwoOqy9+|E@v?RU2C&@ zh1$TEySY_-g1r0%a_7-35fhNfgClxc%Y}5v-(1=>&=R={WDVXoL`gI)+D&vJ6^wg) zE>HoEzaM)Deti0dAAgpd+n64{)G4)gxwY_^c8vZ9Md{RvQj8o#20WdCZ@`t9>LybIE2vz6nME4AKO)HJs*Duby$^Y)&(V%Bl{-KPR8MoVj)&3kj%>F zCxtHpSRK;_m9(#>e7<8Ac8vIPovHPh9LUqmnh<@x3Au#UtDR9pPbT|lSwLq;emI|M zU>&-uI6`xegcS|*FdbfqO%I8VQFn5K`1dp?->*-*l1M?}jN5-svkGK9zFM(o4~+uH zY$UNl2*&qNL?2yv#K)=5`Y_qLp(XEv;z_&daJ#O?qfkbMe|LwyWtQ@mZhI6?EEMBt zaKEGjUx<35eF-na8zdcrK8}OKf8Vnx;T#p$rnuq6?!0zvS64Nh$^RyZh6bhyV9UW6 z^+kozHsqLt8yu;J*^wOIB4J#$hio`NOHr6dkZCAAd6#4u$aWzGFy1t6fg&%qjbQLq z)oqI!VpJ{ZRNxTd7FGFyC?IG>^S_;DUEy_eD4oHCATvR}k!k!V(P%dXQ2c0WJ<1(= zsN+YLa5nj84;k+h3hj!Tx2A*F?N=$T#1iS&rkEudC1nrK#Cw9Yfm)A6#E18|KqFk`B(%45`UAS1xJ4%fZd6x19*NZUUsb`>4aMe4|YOCyO8? zEqjhqZF4X_V}(mrWUyoj56)(rn)@}Ths$Ae%Z|NzT2(A4&U$y-3|ri(z<&%oWa|<* z#@ycXXpArDHO+ntKip^jDZ}=?4Wp?u-}mSCd>+S9rzNi|HjkmRtEmfzunwWmRvlkj z6Y1NKgy5DZFUzg8Z;M|JH*e(H?jMAg(9C{Kv-Y)*#@+tLYI~h}0DKD*BKOq*yb@mG z2p^X9q5!Z20fSf^i+90mr%M|Q;Icz{fbN4Gr5gn_-$;@!S96TNQ{Y!&K_bEnJWN^e zgyNNUDYmok1iCFw=bhS}$1`sR*1(*H5n_fU(q`X=%U9Bg%0qe-NwyNS5y%uVK#-qA z9yL)KCapv7-~S7%21@LIM+8Jb*Y<&c7yNQp>A@|VuJgN~=zPN|;x0DbZ_cH1`Q9jQ zAbG%b>yGZy-@2T;M%K*&b0qQXP?M9N{Urztt+kIQfN7wp|jqE+u1{PT#fpGa0 zSfVO8PUqfw9$tPcZl&`?nd^JA8I;4<&mNY=kqszX0eKn#(H0a#>6$7(~KuMo^ntALU7TE@b>(aSNgP72v8UuBZe z@>isXM02SX!fVq*66Os}p#1FaPP-tU^xVpianCxSUqNURG`N7LWL~xhfZFQW3JPpF zExZqAw(6CV)v52mVn3Y8NWxY8P!}V{f63q53f)5(i7C|uA;dj=gpWG`Zr53)DSDZG zDYLx`%v^z;CTUFkZ4`nGM-dE^{MBN&l}lPFTvSMGEs8rSnk0l3Q=MI|+*optoEY>& zuhmf;S}0T%&$4{Qu&)OmVHbGX8dFoGHl`CVE#$qem7udMRClQ6-jpYw{AtK6G`h-TN&LEl)xIgX!w6+J!I! zJ<&BaxqGaiD69)=C6GFjpnyXWG%A5ljGJVQwr;$w<)pJ5CGp>(*owCX*Tkd7tZaSR z1mn)yYERK(rnX=shELY!St#^oR_D*AS4KBOj*H&j##6plG*j-zy3>Ei=h+6}?Ucyd zPf*w!HL(Gw-MxHKc)JMjjf|1`@S-peOQ|xwGo~w9l&urEOEz;w>#?){xa9xK_@v=L z6iWCJU?YhNW!qn1BkNbqs3StmqcnQC$C`8;ASi(wNAI#Mb`fF7mudW>Es&}-dR3#Q z%QpwIfwt*Z_EU>BS18F#Syu;5tbfcv0qoME7*GxT&)PTXL&D}#cdJf5iHHa`b@EEJ z^&yObdH}0V)r5iF24&Zj0}LCN8F;lbPRFqPnSvdOO4O8f9hco@NL`72?OMF3{$+nt!P{ zOqMV;lBi|ay9-i- z@N4yS05?k&dEJ$|TT79_PbKJpNyjDOFnO#E)TzP!3b2}$SxqCCGI?+*Gd#qBrG;yR zew(kKe>#%zX_s-w=|mcqRzK7~bWZWYu=MQ*VxO4Xd%v>rhCdp`P|Xbc2ta1#t5m#l zd+($0lft(IgKpBht?z3k=xdp>c^f_>DqnOVPKDMcqV2L2LK6fhC9x zXd+1sk3#l%x8nbSqr`E19FN~q2#mbRR1 zKqv(9B=E?WC2!vkpJy_@o8X8K3(gxJ{5Q+Ah$voR8TX3^91}v7iI~Lh}dpk?x1|B%(sn} z*$KBw;Kw7hetX&5cOiG0`OQ=uzdaB|H`7`?%+OdAhaw5~#$!kLL<|(oj~3;c)snqa zYBdK>TzB@y{tEDehp3aniJI5`RSQFUr1TeC>qCnDQ$<$ptQ_waxYV(z9~8;z&UenZ z#f3-Q{H&b@dx;RP^?kUhV7xd7IjG&@Kg-Ibq^-oRAm^jBAs7u|22;n;rt@Jsn)g`K zbPC^lvGkwGFA#=)vFu#FgQ?C#qUQHYTJDG@!Mjv_CTjceVl-wAnfw0qCmIm0FS`q? zdzTsA=p7gl0vCLA&z<6?SkK$3_mQ-Txw7>#du{u3wDI!V5ga7LsoGUrUG7;+zrrc? z($tG`gF8@VGxbXMw{CX>z*5vEjaA#RCMY3)e)i+X#Or1O$O0amIUyJ~)^;);)Y{Hb zhg0&|hOOeUMN9i*uXw@mcBHiM@(eD{-C%tb8LhMStL6<{fGvHM3W63b?3})qzgCc< z3e+}Dtap?>W1)M9~439&!SLn!vg=hhB;fu1ao-QSwIhR~9 ztz4VCp04Z6OpfkTo;xB!(b4R`6yEE3dwJ2FDyKM_TOmS0+byumZTik-2B#&)tC)?~ z=YLX4QIbvCs*^u*A$Ld>+pkgl73dQLo{*AX&p&lFtDIWS=9g&n=FT47A0>a{kZYgs zeqMR3O!Z}I`^(eOm(Fa?@tspExmg5<4F*)V4R08x+680h=6kg1*5@^3@DE^LQ@V(N(faW#8ky z7g{YdFNNZ%L`9Jmyo4x|+9-KFTOh(|M`5U39>>2DEY6nxc%NMBN4`{mXX6ODfh;+*=UHg=Q3SEliYzw{c(?^Hd>;2`XEZlokGbW}XbMic{duu~W zDfNu9h;aPwy;b{y9bL`o(#Jj(yEVu#J5C&;bvc`;x%JE1(+<5+%Rg?Rxmia3rsmd@ z-zJp46p5YFH?2*K$Q*SpPMp9oIPN_3ks4hS-Hg|kjlA))+lE8FJe+Pqf{>|;{7cE! z#+^abNTt*5fOW+;RI&qtXW$zjNj}+bO*75%rjH0J4jYE?Q0$tlU9xE7o55}NB)zQ5 zJu!VFrb;G!Kopm=RZKnGH=AXWTe=vYXcaavP;!q)Lh3F`gBs5hhM~fyjPw)8G^aY^ z6y{j4hYEi1yw9;+XXJ}b>-%b_#N1!2SaKM?I89HiMP}VB8(La@alsvp$ypA*wHnDe z!mGd8soFNk;(8*I(J+M4yS`mTb7U&r?3s+Zl7)Gm+@9LfX zb>1OhB!fe!m++mp(lqq2T+ZZeNxtZ4fq-~YX=DwPUl1KjZl=S2ci$JSYdB}Ri;H$B z_l*61=INb|7fBIht49gUwjW!v`gzme#5;94o5p7c47a-A)Wnn0ep++~;-e{}Tw8G7QM@TqHA7e2~O|L#-V9@zHD?m5Cg z&X&39ll^00+tc!>_s2P*A%lDGduClYZf}ED_qO8HD zNDli+EjzWNI6Ti&Rx?j8S6JG_%jT{M<;(pDWI__bM91nh_4yVy!i!flRTm!bM+vDi zT-A*E;}s@$pHn1AlCP+Q_B?B zpGsIg=S|JA!=bM^&W?W|%ba&V|M~G(>y`>()~}|>*QO62<|FRZbWID^P;|~ytWg_| ze9)WrWnJ;~BbGP7Mrxwh-B=Llt_d<%R@78(@(3i*tS_+h?PKHE@-X=~V~`rQ8^=`Xv! z6X^Nfd22^9`$dA?J$Gd$enoC*s>D-dy-Bos+dN#N*0EjAc!)RkR1P(60f)VKHJJup z>Q&>HMkJm3UT$cmuz2CDs%f%k8#8B6G>5*3A%pPctcgP*`&}!7%6+qsoa5glWZ~Dm z|9DyZEQk+&$In&PTCM9iJo`%D`4;2eRFY{8Fb&=^h+Hk{ycp5>5{G5!q668yXofbt8-dPs&%x8#4fcAS%v?%#J+eH+G(z? zs-KrS;dqD_`zB&w_SI=e9$Zdn-blQ=C~EIFPYY8<-SSBWMi%U)fa1$}X$=GMk`t$> zNJx><9|MILX;$M}FkCwEUSUPI>9+C;sb)i7q*xj0R2^OFpU@x)my`~j>v%>D_s1#9 zh6!0}sq6EkDV6o}@XF-kjXUXt;I5BaWeUe+x^oQ^amJ|2fv+5NrWBps>9EAnM3kR+OlKDWfn$yAC5L}9^vu4H{t1N=av?d@~v=K*IPZfIrS-y zipTz5PMorNtSh35q=tk1u{6{VM8=H2v^j{+6Bi+$wqreM^y8a*NptNG&l2UAspa6t z43rx4+on!1%icQ7a%yuvJ+)%@hs{LLU;r9i(>{I3HNrWI;J>RG%b z*wn2RUft!J6cLcn>vpfAq$IDGG`;MR%*AQ~1+IU1C*@o6x@KsD&3KX%@)EL;u5JF4 z$rBsqJ*D`b1Wk@k)V|h3Z{ZmOi=L@b<5QF^@-|T#y7#BM6O$(NJ^?+)iqUJfK4;J6;9~?3cLB&W}C~%woM7~ zV5j z+V(5u*wj*Xb$46DFESy+@e*RiPV>(4q)CHA^Eh$&-t@7e6(xvTRAqGSioix}VvR^2c^zJjnrhFVW$X zM*6Q`zoRJKC?jVc{dy%KZgGq=%DM8_Ycd0O)}|>T36H{Vigca5m@+%nw#ays|ZEs1c(1r3F}A3{<_6y8v2J}K7ya(QoE?ZG#mi;~z%!=dgU)gh@3 zol}a$$S0Ci&VQ_+Q$K@}+|xyZGv4)J@om)jltL)W+f%oR%W7;imX;HS3I~!*@B|)< z(+Zo-!4QnhJQOf3LXq!&%k|Dut5d%_Cp)4L=+w$IIsMziN5%RO-~Wz93?-4T5xJj) zws9-Y>HFJ;*o>Qj3hxA>c?Na;xC|H9Y)?!5_Wx!1C%Eyy#3_B=-mx%wrq8L`h4%$d z?)!lP4RfV%jjLIO9lbNI67!-Qw@N6|AJl_k*T~K|m8v)5G8)!k(;E^!gZRlp)FD=l z6^0KGu}plYKG01fiJM+$aDVpstap;t3Fua>wrevPnY*oiGCCYgyIB)MZ}~viTFvgf z>DuhXyuOci4}B9~YMqz1xr=kL6hSN9P|IAJd-8Qyn|s1jmb)o8OwDAMP5-*9>ezn_ z)`9Fl!&rTYXuCVGdYr23K-7|i0m*AgX|SuIJGY0F2mJ9Sg<1a4oW#=8M)z;(2huUe^rUBUo-ui~;a zdiYoiT1P+c*Q5Q?>ly}!kY(<2Ce})Bnj+*?Vb*`8saZMA#5DH&+D@D9vvGIM>zsbp zJ`<2Y`aVWN1D$6gLiy*Jq~~vEhi6e|vz*|X>;JTEZr{puEIpB4;5pC9QT zEQD3D=XuU&1+K|r1?%z4byEi~B3d9HB$|V6zg;SxTWV*{p0%9+TA)+@B%MvMJf8qEnbbBF5vP2#(Y)|E-uo_s$(s~g4*%g*~J_27j_p2 zPtoYHO`pz&dC`3ZZ2xt2$?S}+^|?{5KL?9R6c_yVcs?-Bi;qCl7G}8b5iR0ORr=>^w54L#G6PTqq+r)Q);Yn9jj)$x4{cd6E-?>@y@r>%1Oh5GU zb+-#F2Sfc>PEk&jT~Kfz+6VUlHJ(|};DQrWg;@g8q4nfdVd1Rg+48_?-|THI1DCzI+$oxN#=OHXjF*dJ1Kocnr6;f)h7r&Nxb%C8ld z*k6Q%^++N;b%|LNk#v9uvD`6fa$j1PQ`IAzd-MK!UT6&s&b)zaX#eQ73}{?;Krw}_zQ0n(dxTW)RC+H~dTZKjrF+ZWL zx3cDoj@H^!_lgxY_aQZsBWKHn2<%=7w`C6}`z0WseL0FNEnTaUk^OZz=e-L6kBgy& z?c6vH#{&(#p+U2a>fO6{sd+6Eo5$}fE5EU+T9y}&?+_iRmPwVB+OFBt5z!W>3>6;S zdl`aGR&BJXv@L!;&pd`zapy9(17< zXQT=bBbEuuOG~>6b9B-hEjILngm;1?75$8w z{L$j_KWsERX((1tCht8x+7;OqeC<&(*Yhqpm6S4xvBcrGu6=Q>6FtzrI6OAiGHBCH zW@2)ZhnJW4MK~eup973S5akIvl_1Iih}9ptGd=Lr! z!_%Oo%(5}?jD+)_G=h44jgdS1jGyKchYy@5<~mESAIMMFl< z=*E0kcB?yBT~99uy3ScZ@su@{Q|(vr|Dm}=-aPD3`=Rr}E*Tmr(fy;(MAQ#ipB_Dk z7{P2nO!Zee<^I3=rK6R9Y}%%Y4yB4lqu=scAktuMor}ZaemO|^=u>q~b%$P?PGUiJ z0FkVlwgYj!!c(I9j@=|$csxEN4Z5tI*F;Xf5F7O4va`>eJ2yPhSrly_R3jJGIn@i5 zd+F2Dj|QzujUs#H(dclfYjs$izTf!i6LBSDQfDgc?)CqY+77zZ*;x!Fqu4zs*8!E{ zifIia&{IlgDY37n4i`B&{c9b(sFA_={FinOmgiJqnphYAt|plUvDSZnNw+OHS$+xO zcv3l1CqUvgjXEFpUs9D^e{Z;^0TyNoCT|Y#>lsu`5;T`GYibOjt?WHHIq#H|l+|2YcSQ~kjzjho z$Bv1ZG#$SsFV7<`t_@6a`7tpuhbOH@tF%vZ-hDf~&<~WD#ZC)Dq6CM&hIlN=2Q4Vf z(-VzVdPU1$*ff1R$=n@UXjDLx@bI^9lU}?aGc_~AOm)QfJDR*kdaGG{^g0yy;>A5D zr&5{S)m|^xu*gWu#c$<-FJIo*(~GOw9$CU=WN;dFK=o}-p8J9&#fcM|Iyx~4K+q;y zl4DqF01PP_&z^mNmU9H?CQPiibeI2{?F7Mv@aeS5BWM#0Z|_QvG{Eg|1n=LQW6PYm z{auQFH43`H(MPtLLQ_8zDn>CQpx%)=efF$5)E0Z|Gf)=-cY+zxhc+}a(tK@k#Ij~N z$TkXEr$)uZFj#i%Y_2H5AilJ>i%33Q6o$q&;n5O~ILB%%_6xLXi~U!m?NwBv48%FU57ueX^!2>CIc+_ZvQ+q7NXa;K8AJ?&Jce{k>>?`Q&7YmFnG0?BB zUTU@LM!(oD*=pRMKTqtxa70D{q1v$3aA`ZE_>)gnKoKwr-Ek{L%s2l`aEgldR{;-~ z*;tMHB(#bb0a7d>M@Ps0I4JLpg%)rK+3e??&i$QW`pXY|p%d;K$Lhku9)SEumD^9n zK*Q`6pe5@ATI`9mt1ku6U@RPZP0`)=yDU%hQ7(BEEcL}Ejw`NY;Xgw zEgn6()I_)PnJpM-SeuBe`sW9W+QEhA1WV5T=;%AFcN{dk{qs9EC2$qxz*>?+^vZ+S zrT>KI#tnO*Hi%oU=$sM)J3pBzyNeakxPRaHn@!o~$dZTo>&u+7vSW9yNIK2uPwq?z zdR`WdZFd9NZK5;r0J^Z)jg4p7w_^>r3tI(>WNLuJwhcPhA2#rW4xN+FNtve|K&cC_G1(o<|sP8d63 zY0MBPcLoHUr4qbG8Khl`{g2Z>AoRmxghxkX!3kS-z)aQA=yafz)w0)tQb6SuX^Gwg zt)qi}dDo!r(xYCXRkp68J?OG&31#sWKnoEb7nid<-aHpsyOZu%os@J9ZeMr;OvuPR3zsG4&11R_Pufng0=-@H|h6EkiX*Bx`ppiR0O z3z81q9~~ikj_Rp#-(IaFWtj$g+lZSmRaU;QB&%mkO-=u}R3F+micI?_dbp0GnAUfb zNrYZOY0z?9hn}83Bn`HvKTvtzZMALEQ0{7cGjuas4Emo6Lw<1z<5Br zZ3{3%Arb>eR@v;Fz)-scfhb=T6jWhx`+YrM@z7QpSf1JB0A^K?wfcsJ11|)Q9vBKy zDzdx9>=rv4%Pp;~y}+;H2#sDz$|aI)oq_#m6&jMxmeua<Pa-@atf@Q*JSc*`41d zb2wc}z4rDXDtRNSBFn?geIF>K-r^3s1p6I3ivgW>0pSvU_S?u(N<;)56cmKZ$|8#z zt0;rk^}`F*la}VOr%#_weevRjGQrLqbI7F#=!UVy#p+y4xo3KYP+((0F2}K0pIZ<}N2R#9$II;WX<>mcRol|NyHt!1y3%#}I zsfS))Mgon!jb>R)Z-$a}#9bA54nnP@GH^p__4RVuW$0$v18Y1H0Wt3Q{#xYOQ-sMMGWv5wtt&kAv=FU#3lr6xFUk}!9orV=?pGBuQiZlq=HRGftK-my9c@%A~claPL=IIM}=7Ra^Tic zg8pG2L!*d@U7*vbl9g4?V3l(bHxEzYD8|w&460uZ0m8EdK0wk~7d@y>9q%2|6?u zPda1-Na9F7d@c5%NWKgs3oW%Bvw^*CQ&W*P1-GiU0wNdl`SYC>AY0V32i8oRpFe-L zCGM?qtw7K5Sm<}(e*BzhTOS^;ZJ(i%XwwwTVhe`AOlITz5uqo)?jT%Ob1t`T`2oG? zWuTN40`@L5;Bw`>a-|1h9P5vRadr9n`9<%8j}b=9xMBp~W#!D?`LSBKaUiP3Lw8cs zqm&E}MXkOX0jpXYfH{nzrD!`_u+)S0WR~YEH@xr-Kx1fum_4vP(NfNZo6xh@TmKYw z4|~b>8o-UT?Pq6a+tzE=rBAc4*do4Ec5hrZ1|k-(Z8vCsj*x7No;-Py^YZ0xH{fHs z${fsP)B#L5=1oE8%;BZ)0(X7uaV$B}IW3(zj; zvHZ$wPA0KaYWdS?PUw*KVZ_G1f%0k;G)*UI6ZhEpfcVdsfq_>pU%osEC&0@PVIHyv z>T`2o0%Aa31BBsie4 z9-qmv{oOI88#jDJ;6&CD+)iiWU4HoZ->6RPT*t@1(d)H4Z2#~5nl=TNUECR74dP1p zkQeg}FSy^cqlE))Sq*fyRS0=syXV5MQ=HWduKcClEOoCo9X%jXjqQXJj(>9Au=^z7V(Q>Xuq{0~wfLY&&Ocdq$`1zFr7~ zdYQmerG5YYeFYS%yUKY*T|>jneUUI=n1|Rqu!{DRMxcJRst+^K9W;xf*DsItul2@x%BVug-nIsqsg) zrh|IigXBO2JDd>^>)ZSM2ib}%Y(p@Dr$YuJaMM|>R6vY<+)& zjkO(rf4^=GfjSUrhFqSMbK`|Asd&xr3%eB$?w=Qi1tuXqBSU9=A1rt}5+p!>=F%pJ ziBpS-a)lR9EyC0=>6El&8+XegG4>2$S=r=*D6blLp3J0wYH{0uyI9YI?ZfsWdneV=t{q z4QMpE)oQqx0z=BxD_4!wcZvp^SWyiLcrW|whz@q4U|kLJt{PbZ8?}VpcP~Uwp{YG_ zRXnp!Nh!pM;Dw|Ca0%p}*}|B<*R{_RPiBHWcT6&6A?@^C?!tkDr@`3+GhAxD1z|jr0!r-6grFy@ zs!Gp(vaS0KljJHQcaZPGgU;Fa|F!_q>OaqWs za5c}Z-b5M}mONmo)fgNcWRP+D-~n#r>DC+^N4vL|YV&N}#?LW`V8W2&(821%nM-oN z9W?8MmzDn%+rd=`(Rts3!^dF2@ITfdqW^#YTG&5g&bye0v!y^pod|VH`R02$Bk%tM Du!Y-_ literal 0 HcmV?d00001 diff --git a/benchmark/matrix_add/plot.py b/benchmark/matrix_add/plot.py new file mode 100644 index 0000000..703c9a8 --- /dev/null +++ b/benchmark/matrix_add/plot.py @@ -0,0 +1,50 @@ +import os +import subprocess +import matplotlib.pyplot as plt + +result = subprocess.run(['make'], capture_output=True, text=True) +# Define the folder containing the executables +folder_path = './bin' # Change this to your bin folder path + +# Define the input sizes to test +start=10000 +end=10000 +step=100000 + +input_sizes = list(range(start, end+1, step)) +# Initialize a dictionary to store runtimes for each executable +runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))} + +# Loop through each executable +for exe in runtimes.keys(): + exe_path = os.path.join(folder_path, exe) + + # Loop through each input size + for n in range(start,end+1,step): + # Run the executable with the input size and capture its output + result = subprocess.run([exe_path, str(n)], capture_output=True, text=True) + + # Parse the output to get the runtime + runtime = float(result.stdout.strip()) + print(exe,runtime) + + # Append the runtime to the corresponding executable list + runtimes[exe].append(runtime) + +# Plot the data +plt.figure(figsize=(12, 6)) + +# Loop through each executable and plot the runtimes +for exe, times in runtimes.items(): + plt.plot(input_sizes, times, marker='o', label=exe) + +plt.xlabel('Iterations') +plt.ylabel('Runtime (s)') +plt.title('Benchmark of Function Versions') +plt.legend() +plt.grid(True) +plt.tight_layout() + +output_file = 'benchmark_plot.png' # Specify your desired output file name and format +plt.savefig(output_file) +# Show the plot \ No newline at end of file diff --git a/benchmark/matrix_add/template.cuh b/benchmark/matrix_add/template.cuh new file mode 100644 index 0000000..baa94a5 --- /dev/null +++ b/benchmark/matrix_add/template.cuh @@ -0,0 +1,10 @@ +#pragma once + +typedef struct { + int rows; + int cols; + float* data; // array +} matrix; + +double time(int n); +matrix* new_matrix_d(int rows, int cols); \ No newline at end of file diff --git a/benchmark/matrix_add/versions/1.cu b/benchmark/matrix_add/versions/1.cu new file mode 100644 index 0000000..cd4b001 --- /dev/null +++ b/benchmark/matrix_add/versions/1.cu @@ -0,0 +1,44 @@ +#include "../template.cuh" + +matrix* new_matrix(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->data = (float*)malloc((rows * cols) * sizeof(float)); + return res; +} + +matrix* new_matrix_d(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->cols = cols; + cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + return res; +} + +__global__ void matrix_add(float *a, float*b ,int rows) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx>>(a->data,b->data,row); + } + double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC; + return seconds; +} \ No newline at end of file diff --git a/benchmark/matrix_add/versions/cpu.cu b/benchmark/matrix_add/versions/cpu.cu new file mode 100644 index 0000000..ce36a1c --- /dev/null +++ b/benchmark/matrix_add/versions/cpu.cu @@ -0,0 +1,37 @@ +#include "../template.cuh" + +matrix* new_matrix(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->data = (float*)malloc((rows * cols) * sizeof(float)); + return res; +} + +matrix* new_matrix_d(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->cols = cols; + cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + return res; +} + +void matrix_add(float* a, float* b, int rows) { + for (int i = 0; i < rows; i++) { + a[i] += b[i]; + } +} + +double time(int n) { + int row=100000; + matrix* a = new_matrix(row, 1); + matrix* b = new_matrix(row, 1); + + clock_t start = clock(); + for (int i = 0; i < n; i++) { + matrix_add(a->data, b->data,row); + } + double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC; + return seconds; +} \ No newline at end of file diff --git a/src/main.cu b/src/main.cu index 1a12d40..59ada7b 100644 --- a/src/main.cu +++ b/src/main.cu @@ -21,11 +21,13 @@ matrix* weights[NUM_LAYERS]; matrix* biases[NUM_LAYERS]; // device weights and biases; -matrix* d_weights[7]; -matrix* d_biases[7]; +matrix** d_weights; +matrix** d_biases; + matrix** d_inputs; int* results; +int* d_results; char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r', @@ -102,74 +104,72 @@ void read_tensor(matrix* a, const char* fileName) { fclose(file); } -void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) { - matrix_mul<<<1, 1>>>(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols); - cudaDeviceSynchronize(); - matrix_add<<<1, 1>>>(output_layer->data, biases->data, biases->rows); - cudaDeviceSynchronize(); +__device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) { + matrix_mul(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols); + matrix_add(output_layer->data, biases->data, biases->rows); } -int infer(matrix* d_input) { - matrix* outputs[2]; - outputs[0] = new_matrix_d(98, 1); - outputs[1] = new_matrix_d(65, 1); - - propagate_fwd(d_weights[0], d_input, outputs[0], d_biases[0]); - relu<<<1, 1>>>(outputs[0]->data, 98); - cudaDeviceSynchronize(); - - propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]); - cudaMemsetAsync(outputs[0], 0, 50 * sizeof(float)); - relu<<<1, 1>>>(outputs[1]->data, 65); - cudaDeviceSynchronize(); - - propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]); - cudaMemsetAsync(outputs[1], 0, 30 * sizeof(float)); - relu<<<1, 1>>>(outputs[0]->data, 50); - cudaDeviceSynchronize(); - - propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]); - cudaMemsetAsync(outputs[0], 0, 25 * sizeof(float)); - relu<<<1, 1>>>(outputs[1]->data, 30); - cudaDeviceSynchronize(); - - propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]); - cudaMemsetAsync(outputs[1], 0, 40 * sizeof(float)); - relu<<<1, 1>>>(outputs[0]->data, 25); - cudaDeviceSynchronize(); +// __global__ void infer(int a) { +__global__ void infer(matrix** d_inputs,int* d_results,matrix** d_weights, matrix** d_biases,int size,int iter) { + - propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]); - cudaMemsetAsync(outputs[0], 0, 52 * sizeof(float)); - relu<<<1, 1>>>(outputs[1]->data, 40); - cudaDeviceSynchronize(); + int idx = (blockIdx.x * blockDim.x + threadIdx.x); - propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]); - softmax<<<1, 1>>>(outputs[0]->data, 52); - cudaDeviceSynchronize(); + int total=iter*size; + int stride=(blockDim.x*gridDim.x); - int* d_res; - cudaMalloc(&d_res, sizeof(int)); + matrix* gay_input=d_inputs[0]; - argmax<<<1, 1>>>(outputs[0]->data, 52, d_res); - cudaDeviceSynchronize(); - - cudaFree(outputs[0]->data); + matrix* outputs[2]; + outputs[0] = new_matrix(98, 1); + outputs[1] = new_matrix(65, 1); + + for (int i=idx;idata, 98); + + propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]); + memset(outputs[0]->data, 0, 50 * sizeof(float)); + relu(outputs[1]->data, 65); + + propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]); + memset(outputs[1]->data, 0, 30 * sizeof(float)); + relu(outputs[0]->data, 50); + + propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]); + memset(outputs[0]->data, 0, 25 * sizeof(float)); + relu(outputs[1]->data, 30); + + propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]); + memset(outputs[1]->data, 0, 40 * sizeof(float)); + relu(outputs[0]->data, 25); + + propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]); + memset(outputs[0]->data, 0, 52 * sizeof(float)); + relu(outputs[1]->data, 40); + + propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]); + softmax(outputs[0]->data, 52); + + int res=argmax(outputs[0]->data, 52); + d_results[resultIdx]=res; + + memset(outputs[0]->data,0,98*sizeof(float)); + memset(outputs[1]->data,0,65*sizeof(float)); + } + free(outputs[0]->data); free(outputs[0]); - cudaFree(outputs[1]->data); + free(outputs[1]->data); free(outputs[1]); - - int h_res; - cudaMemcpy(&h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost); - return h_res; } -void process(int input_size) { - for (int i = 1; i <= input_size; i++) { - results[i] = infer(d_inputs[i]); - } -} +int iter=1000; int main(int argc, char* argv[]) { + + if (argc < 3) { printf("Not enough arguments."); return EXIT_FAILURE; @@ -189,6 +189,7 @@ int main(int argc, char* argv[]) { weights[5] = new_matrix(40, 25); weights[6] = new_matrix(52, 40); + biases[0] = new_matrix(98, 1); biases[1] = new_matrix(65, 1); biases[2] = new_matrix(50, 1); @@ -198,21 +199,15 @@ int main(int argc, char* argv[]) { biases[6] = new_matrix(52, 1); read_model(argv[1]); - d_weights[0] = copy_to_device(weights[0]); - d_weights[1] = copy_to_device(weights[1]); - d_weights[2] = copy_to_device(weights[2]); - d_weights[3] = copy_to_device(weights[3]); - d_weights[4] = copy_to_device(weights[4]); - d_weights[5] = copy_to_device(weights[5]); - d_weights[6] = copy_to_device(weights[6]); - - d_biases[0] = copy_to_device(biases[0]); - d_biases[1] = copy_to_device(biases[1]); - d_biases[2] = copy_to_device(biases[2]); - d_biases[3] = copy_to_device(biases[3]); - d_biases[4] = copy_to_device(biases[4]); - d_biases[5] = copy_to_device(biases[5]); - d_biases[6] = copy_to_device(biases[6]); + CUDA_CHECK(cudaMalloc(&d_weights,NUM_LAYERS*sizeof(matrix*))); + CUDA_CHECK(cudaMalloc(&d_biases,NUM_LAYERS*sizeof(matrix*))); + for (int i=0;id_type == DT_REG) { size++; - } + } } - results = (int*)malloc((size + 1) * sizeof(int)); - d_inputs = (matrix**)malloc((size + 1) * sizeof(matrix*)); + results = (int*)malloc((size) * sizeof(int)); + memset(results,0,sizeof(int)*(size)); + cudaMalloc(&d_results,(size)*sizeof(int)); + cudaMalloc(&d_inputs,(size)*sizeof(matrix*)); dir = opendir(directory_path); - while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { matrix* input = new_matrix(225, 1); @@ -245,23 +241,27 @@ int main(int argc, char* argv[]) { strcat(file_name, "/"); strcat(file_name, entry->d_name); read_tensor(input, file_name); - d_inputs[file_num] = copy_to_device(input); + matrix *temp=copy_to_device(input); + cudaMemcpy(&d_inputs[file_num-1],&temp,sizeof(matrix*),cudaMemcpyHostToDevice); free(input); } } - free(file_name); free(file_num_str); closedir(dir); - // Process - process(size); - // Write to csv file + int threadsPerBlock = 512; + int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + infer<<>>(d_inputs,d_results,d_weights,d_biases,size,iter); + cudaDeviceSynchronize(); + cudaMemcpy(results,d_results,(size)*(sizeof(int)),cudaMemcpyDeviceToHost); + FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); - for (int i = 1; i <= size; i++) { - fprintf(csv_file, "%d, %c\n", i, letters[results[i]]); + for (int i = 0; i < size; i++) { + fprintf(csv_file, "%d, %c\n", i+1, letters[results[i]]); } fclose(csv_file); diff --git a/src/matrix.cu b/src/matrix.cu index e8ed37d..205e62b 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -6,7 +6,7 @@ #define UNROLL_FACTOR 8 -matrix* new_matrix(int rows, int cols) { +__host__ __device__ matrix* new_matrix(int rows, int cols) { matrix* res = (matrix*)malloc(sizeof(matrix)); res->rows = rows; res->cols = cols; @@ -14,22 +14,33 @@ matrix* new_matrix(int rows, int cols) { return res; } +__global__ void alloc(matrix* res,float* data,int rows,int cols){ + res->rows=rows; + res->cols=cols; + res->data=data; +} + matrix* new_matrix_d(int rows, int cols) { - matrix* res = (matrix*)malloc(sizeof(matrix)); - res->rows = rows; - res->cols = cols; - res->cols = cols; - cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + matrix* res; + CUDA_CHECK(cudaMalloc(&res,sizeof(matrix))); + float* data; + cudaMalloc(&data,rows*cols*sizeof(float)); + alloc<<<1,1>>>(res,data,rows,cols); + cudaDeviceSynchronize(); return res; } matrix* copy_to_device(matrix* h_mat) { - matrix* res = new_matrix_d(h_mat->rows, h_mat->cols); - CUDA_CHECK(cudaMemcpy(res->data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice)); + matrix* res ; + CUDA_CHECK(cudaMalloc(&res,sizeof(matrix))); + float* data; + cudaMalloc(&data,h_mat->rows*h_mat->cols*sizeof(float)); + cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice); + alloc<<<1,1>>>(res,data,h_mat->rows,h_mat->cols); return res; } -__global__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { +__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { for (int i = 0; i < w_rows; i++) { float sum = 0; for (int j = 0; j < w_cols; j++) { @@ -39,20 +50,20 @@ __global__ void matrix_mul(float* weight, float* input, float* result, int w_row } } -__global__ void matrix_add(float* a, float* b, int rows) { +__device__ void matrix_add(float* a, float* b, int rows) { for (int i = 0; i < rows; i++) { a[i] += b[i]; } } -__global__ void relu(float* a, int rows) { +__device__ void relu(float* a, int rows) { for (int i = 0; i < rows; i++) { if ((a)[i] < (float)0) (a)[i] = (float)0; } } -__global__ void softmax(float* a, int rows) { +__device__ void softmax(float* a, int rows) { float res = (float)0; for (int i = 0; i < rows; i++) { res += exp(a[i]); @@ -62,7 +73,7 @@ __global__ void softmax(float* a, int rows) { } } -__global__ void argmax(float* a, int rows, int* des) { +__device__ int argmax(float* a, int rows) { int res = a[0]; int idx = 0; for (int i = 0; i < rows; i++) { @@ -71,5 +82,5 @@ __global__ void argmax(float* a, int rows, int* des) { idx = i; } } - *des = idx; + return idx; } \ No newline at end of file diff --git a/src/matrix.cuh b/src/matrix.cuh index 83005cf..235ae78 100644 --- a/src/matrix.cuh +++ b/src/matrix.cuh @@ -6,18 +6,18 @@ typedef struct { float* data; // array } matrix; -matrix* new_matrix(int rows, int cols); + __host__ __device__ matrix* new_matrix(int rows, int cols); matrix* copy_to_device(matrix* h_mat); matrix* new_matrix_d(int rows, int cols); -__global__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); +__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); -__global__ void matrix_add(float* a, float* b, int rows); +__device__ void matrix_add(float* a, float* b, int rows); -__global__ void relu(float* a, int rows); +__device__ void relu(float* a, int rows); -__global__ void softmax(float* a, int rows); +__device__ void softmax(float* a, int rows); -__global__ void argmax(float* a, int rows, int* res); \ No newline at end of file +__device__ int argmax(float* a, int rows); \ No newline at end of file From 2ab6689441596465628f2d9cc844ce3a7ed2bef1 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Fri, 5 Jul 2024 06:27:36 +0000 Subject: [PATCH 2/4] Change optimal block size --- src/main.cu | 95 ++++++++++++++++++++++++++++++++------------------ src/matrix.cu | 9 +++++ src/matrix.cuh | 4 ++- 3 files changed, 73 insertions(+), 35 deletions(-) diff --git a/src/main.cu b/src/main.cu index 59ada7b..5b4b779 100644 --- a/src/main.cu +++ b/src/main.cu @@ -110,54 +110,45 @@ __device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* outp } // __global__ void infer(int a) { -__global__ void infer(matrix** d_inputs,int* d_results,matrix** d_weights, matrix** d_biases,int size,int iter) { - +__global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, int in_num) { + int num_threads = blockDim.x * gridDim.x; + int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); - int idx = (blockIdx.x * blockDim.x + threadIdx.x); + if (thread_idx > it_per_input) return; - int total=iter*size; - int stride=(blockDim.x*gridDim.x); + // printf("Thread %d at input %d\n", thread_idx, in_num); - matrix* gay_input=d_inputs[0]; + matrix* input = d_inputs[in_num]; matrix* outputs[2]; outputs[0] = new_matrix(98, 1); outputs[1] = new_matrix(65, 1); - for (int i=idx;idata, 98); propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]); - memset(outputs[0]->data, 0, 50 * sizeof(float)); relu(outputs[1]->data, 65); propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]); - memset(outputs[1]->data, 0, 30 * sizeof(float)); relu(outputs[0]->data, 50); propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]); - memset(outputs[0]->data, 0, 25 * sizeof(float)); relu(outputs[1]->data, 30); propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]); - memset(outputs[1]->data, 0, 40 * sizeof(float)); relu(outputs[0]->data, 25); propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]); - memset(outputs[0]->data, 0, 52 * sizeof(float)); relu(outputs[1]->data, 40); propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]); softmax(outputs[0]->data, 52); int res=argmax(outputs[0]->data, 52); - d_results[resultIdx]=res; - - memset(outputs[0]->data,0,98*sizeof(float)); - memset(outputs[1]->data,0,65*sizeof(float)); + d_results[in_num] = res; } free(outputs[0]->data); free(outputs[0]); @@ -166,7 +157,8 @@ __global__ void infer(matrix** d_inputs,int* d_results,matrix** d_weights, matri } -int iter=1000; +#define IT_PER_IN 1000000 + int main(int argc, char* argv[]) { @@ -178,9 +170,7 @@ int main(int argc, char* argv[]) { // Start timing struct timeval stop, start; gettimeofday(&start, NULL); - - // TODO: find a way to load static weights and biases - // Load model (The memory of those code should be initialize during compile time to enchance the speed) + weights[0] = new_matrix(98, 225); weights[1] = new_matrix(65, 98); weights[2] = new_matrix(50, 65); @@ -218,17 +208,17 @@ int main(int argc, char* argv[]) { char* file_num_str = (char*)malloc((100) * sizeof(char)); int file_num; - int size = 0; + int input_count = 0; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { - size++; + input_count++; } } - results = (int*)malloc((size) * sizeof(int)); - memset(results,0,sizeof(int)*(size)); - cudaMalloc(&d_results,(size)*sizeof(int)); - cudaMalloc(&d_inputs,(size)*sizeof(matrix*)); + results = (int*)malloc((input_count) * sizeof(int)); + memset(results,0,sizeof(int)*(input_count)); + cudaMalloc(&d_results,(input_count)*sizeof(int)); + cudaMalloc(&d_inputs,(input_count)*sizeof(matrix*)); dir = opendir(directory_path); while ((entry = readdir(dir)) != NULL) { @@ -250,17 +240,54 @@ int main(int argc, char* argv[]) { free(file_num_str); closedir(dir); + int deviceCount; + cudaError_t err = cudaGetDeviceCount(&deviceCount); + if (err != cudaSuccess) { + printf("Error: %s\n", cudaGetErrorString(err)); + return -1; + } + + for (int i = 0; i < deviceCount; ++i) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + printf("Device %d:\n", i); + printf(" Device Name: %s\n", prop.name); + printf(" Compute Capability: %d.%d\n", prop.major, prop.minor); + printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem); + printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock); + printf(" Registers per Block: %d\n", prop.regsPerBlock); + printf(" Warp Size: %d\n", prop.warpSize); + printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock); + printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Clock Rate: %d kHz\n", prop.clockRate); + printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem); + printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount); + printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate); + printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth); + printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize); + printf("\n"); + } - int threadsPerBlock = 512; - int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + cudaMemset(d_results,0,sizeof(int)*input_count); - infer<<>>(d_inputs,d_results,d_weights,d_biases,size,iter); + int minGridSize, blockSize; + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0); + printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize); + + for (int i = 0; i < input_count; i++) { + infer<<<108,69>>>(d_inputs, d_results, d_weights, d_biases, IT_PER_IN, i); + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + } + } cudaDeviceSynchronize(); - cudaMemcpy(results,d_results,(size)*(sizeof(int)),cudaMemcpyDeviceToHost); + cudaMemcpy(results,d_results,(input_count)*(sizeof(int)),cudaMemcpyDeviceToHost); FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); - for (int i = 0; i < size; i++) { + for (int i = 0; i < input_count; i++) { fprintf(csv_file, "%d, %c\n", i+1, letters[results[i]]); } fclose(csv_file); diff --git a/src/matrix.cu b/src/matrix.cu index 205e62b..0188c2c 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -40,6 +40,15 @@ matrix* copy_to_device(matrix* h_mat) { return res; } +__device__ __host__ matrix* create_copy(matrix* mat){ + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = mat->rows; + res->cols = mat->cols; + res->data = (float*)malloc((res->rows * res->cols) * sizeof(float)); + memcpy(res->data,mat->data,res->rows*res->cols*sizeof(float)); + return res; +} + __device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { for (int i = 0; i < w_rows; i++) { float sum = 0; diff --git a/src/matrix.cuh b/src/matrix.cuh index 235ae78..b2191cb 100644 --- a/src/matrix.cuh +++ b/src/matrix.cuh @@ -20,4 +20,6 @@ __device__ void relu(float* a, int rows); __device__ void softmax(float* a, int rows); -__device__ int argmax(float* a, int rows); \ No newline at end of file +__device__ int argmax(float* a, int rows); + +__device__ __host__ matrix* create_copy(matrix* mat); From c1639bc6fc61254dff46133c1fe91f6dbbf46ab5 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Fri, 5 Jul 2024 06:39:26 +0000 Subject: [PATCH 3/4] Tune block size and grid size --- src/main.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main.cu b/src/main.cu index 5b4b779..edee095 100644 --- a/src/main.cu +++ b/src/main.cu @@ -125,7 +125,6 @@ __global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, mat outputs[1] = new_matrix(65, 1); for (int i = thread_idx; i < it_per_input; i += num_threads) { - // printf("Working on inference %d\n", i); propagate_fwd(d_weights[0], input, outputs[0], d_biases[0]); relu(outputs[0]->data, 98); @@ -278,7 +277,7 @@ int main(int argc, char* argv[]) { for (int i = 0; i < input_count; i++) { infer<<<108,69>>>(d_inputs, d_results, d_weights, d_biases, IT_PER_IN, i); err = cudaGetLastError(); - if (err != cudaSuccess) { + if (err != cudaSuccess) { printf("CUDA error: %s\n", cudaGetErrorString(err)); } } From e8db37a68836952cc541723c7051bfdacebeebc9 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Fri, 5 Jul 2024 06:46:44 +0000 Subject: [PATCH 4/4] Fix style --- src/main.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main.cu b/src/main.cu index edee095..486495f 100644 --- a/src/main.cu +++ b/src/main.cu @@ -109,14 +109,12 @@ __device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* outp matrix_add(output_layer->data, biases->data, biases->rows); } -// __global__ void infer(int a) { __global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, int in_num) { int num_threads = blockDim.x * gridDim.x; int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); if (thread_idx > it_per_input) return; - // printf("Thread %d at input %d\n", thread_idx, in_num); matrix* input = d_inputs[in_num]; @@ -178,7 +176,6 @@ int main(int argc, char* argv[]) { weights[5] = new_matrix(40, 25); weights[6] = new_matrix(52, 40); - biases[0] = new_matrix(98, 1); biases[1] = new_matrix(65, 1); biases[2] = new_matrix(50, 1);