From 3b3f33bf2581238d325f05015123fc0a026c394e Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 6 Dec 2021 15:03:35 -0800 Subject: [PATCH] Quality improvements (#29) --- DAG.pdf | Bin 0 -> 70945 bytes Makefile | 91 ++-- README.md | 156 +++++- Snakefile | 449 +++++++++--------- configs/config.prod.yml | 31 +- configs/config.test.yml | 32 +- envs/base.yml | 6 +- .../generate-alignment-and-shortlist.sh | 3 +- pipeline/bicleaner/bicleaner.sh | 20 +- pipeline/bicleaner/download-pack.sh | 13 +- pipeline/cefilter/ce-filter.sh | 21 +- pipeline/cefilter/normalize-scores.py | 36 -- pipeline/cefilter/score.sh | 1 + pipeline/clean/clean-corpus.sh | 94 ++-- pipeline/clean/clean-mono.sh | 64 +-- pipeline/clean/fixes/detok.sh | 17 + pipeline/clean/fixes/mtdata_JW300.mt.sh | 5 + pipeline/clean/fixes/mtdata_JW300.sh | 5 + .../clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh | 8 + .../clean/fixes/mtdata_OPUS_DOGC_v2.es.sh | 5 + pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh | 5 + pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh | 5 + .../clean/fixes/mtdata_OPUS_SETIMES_v2.sh | 17 + .../clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh | 4 + .../clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh | 4 + .../mtdata_neulab_tedtalksv1_train.ro.sh | 4 + .../fixes/mtdata_neulab_tedtalksv1_train.sh | 5 + pipeline/clean/merge-corpus.sh | 35 ++ pipeline/clean/merge-mono.sh | 23 + pipeline/clean/tools/clean_parallel.py | 31 +- pipeline/data/download-corpus.sh | 35 +- pipeline/data/download-eval.sh | 27 -- pipeline/data/download-mono.sh | 80 ++-- .../data/importers/corpus/custom-corpus.sh | 6 +- pipeline/data/importers/corpus/flores.sh | 12 +- pipeline/data/importers/corpus/mtdata.sh | 20 +- pipeline/data/importers/corpus/opus.sh | 26 +- pipeline/data/importers/corpus/sacrebleu.sh | 11 +- pipeline/data/importers/mono/commoncrawl.sh | 3 +- pipeline/data/importers/mono/custom-mono.sh | 2 +- pipeline/data/importers/mono/news-crawl.sh | 3 +- .../data/importers/mono/paracrawl-mono.sh | 1 - pipeline/quantize/eval.sh | 20 +- 
pipeline/quantize/quantize.sh | 4 +- .../configs/model/{s2s.yml => backward.yml} | 0 .../model/{student.tiny11.yml => student.yml} | 0 .../configs/model/teacher.transformer.yml | 8 - pipeline/train/configs/model/teacher.yml | 6 + .../{s2s.train.yml => backward.train.yml} | 2 +- .../configs/training/teacher.continue.yml | 9 + .../train/configs/training/teacher.train.yml | 9 + .../teacher.transformer-ens.train.yml | 22 - .../training/teacher.transformer.train.yml | 21 - pipeline/train/eval.sh | 32 +- pipeline/train/finetune-student.sh | 40 -- pipeline/train/train-s2s.sh | 32 -- pipeline/train/train-student.sh | 28 +- pipeline/train/train-teacher.sh | 31 -- pipeline/train/train.sh | 11 +- pipeline/translate/decoder.yml | 3 +- pipeline/translate/translate-nbest.sh | 4 +- pipeline/translate/translate.sh | 5 +- profiles/snakepit/config.yaml | 6 - profiles/snakepit/jobscript.sh | 3 - profiles/snakepit/status.py | 32 -- profiles/snakepit/submit.py | 37 -- reports/evaluation.rst | 2 +- 67 files changed, 912 insertions(+), 871 deletions(-) create mode 100644 DAG.pdf delete mode 100644 pipeline/cefilter/normalize-scores.py create mode 100755 pipeline/clean/fixes/detok.sh create mode 100755 pipeline/clean/fixes/mtdata_JW300.mt.sh create mode 100755 pipeline/clean/fixes/mtdata_JW300.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh create mode 100644 pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh create mode 100755 pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh create mode 100755 pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh create mode 100755 pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh create mode 100644 pipeline/clean/merge-corpus.sh create mode 100644 
pipeline/clean/merge-mono.sh delete mode 100644 pipeline/data/download-eval.sh rename pipeline/train/configs/model/{s2s.yml => backward.yml} (100%) rename pipeline/train/configs/model/{student.tiny11.yml => student.yml} (100%) delete mode 100644 pipeline/train/configs/model/teacher.transformer.yml create mode 100644 pipeline/train/configs/model/teacher.yml rename pipeline/train/configs/training/{s2s.train.yml => backward.train.yml} (84%) create mode 100644 pipeline/train/configs/training/teacher.continue.yml create mode 100644 pipeline/train/configs/training/teacher.train.yml delete mode 100644 pipeline/train/configs/training/teacher.transformer-ens.train.yml delete mode 100644 pipeline/train/configs/training/teacher.transformer.train.yml delete mode 100644 pipeline/train/finetune-student.sh delete mode 100644 pipeline/train/train-s2s.sh delete mode 100644 pipeline/train/train-teacher.sh delete mode 100644 profiles/snakepit/config.yaml delete mode 100644 profiles/snakepit/jobscript.sh delete mode 100644 profiles/snakepit/status.py delete mode 100644 profiles/snakepit/submit.py diff --git a/DAG.pdf b/DAG.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d1c5a539fa877d6bab1be978e41c573b126cbbeb GIT binary patch literal 70945 zcmagFWmua**ENbuph$6-7N#>S0=M()>?bt6FN01nRlFQ9{_aqe<~gU+|(S@F6QL#1mN!R(aOvT;FaFfW*kR1_?_Y|03NOruV) z=k{i0ySrrTppl1kjcwPIM&p=$3 z$kViyZ{%V3O@e`Kjop1W=;VCLPmVcHb{4Ymwu=sTJ${7$&4QYZb$hU(2zY2r1s zth3t+1T{U4ibBjE!&+Cc!Oum*Kz~2)`(q&V_i$yvpEFgHl^d-c%vLbDL_DGI;tt)r zu-03$EiQoBiA$I{=f!1?WLh7#hHG_!c6#`g)6BxKZAo$z&+HRwOI03YcxX!BbDeh# zeQv?9!AD#op5A@+jtsq+M({m1r61A9-V9}@Ta&&b=tT=vz>~n5>}U?dJys=BD~V99 zR;5 z2ZdP;3*V#MYM>U#;nxDgd}A0kJ0F{9Z>)p-qh|7P!2LOUvY`!VJ5o zFUue{7^glacSYRMjqp)(`PZ>;_?UucVq%Wa@>ugWvS7o7;LO!5`h?oek9uy~Vq2E> zP|D#0=41)`>o0fFT|76*mK4#EYuXR6^;S19>hYFamx*bMPO2+QFK{!HRmVDHiiVWH zR@+{;DZ!{GO!>kaUxDPq*7M%97k-1mE2x6*eSaMLdh?k7tyeP1#~#t@%t~TW)pIk;bIG*XWB2i$Z9cb98`kDj3qUyN97l=4b&0)m; 
zP;IoBf5x|%%)ESHn#w^B?^Hu{SrhVQJ_QsY6kCly*I8YK zPq8p7t@B-wwd&mSMXo_jt}KpPCz`{Hrr^W71bGiJN`;7U@VqGxa_nbxnR(N0KHaO4 z9WbFS5H5Vei=|f88v0Wb>GL}^tHIqTz)a>NSUKgj3BE5hh6sai<#ym1q*)PcO@!&? ze{_MUgiz)vDJL}H$n*n=8&eXS zq|s)psY6Qi3I77J=X5e|k{4xrSd(OAdB)$fufA|x?6y8%bis&qHN>^`{7|-fT?XOp zlDPq~tRHV`K0i;-i~2zim)Aj%?$4LPqBs05#vV%e@kiv%mv)oqlv4i3nkGk)?dXUfFDGU8u#A#cA5w`E3!;BSwN}xH#37tXIqX1oz1OkrZ59fK01iiO5R6`8F-;EvQm4Kz+z(~=8X`4GtNLdTx*otrwm z(>$Jt@PgD(!x_JTSH<$DHazQvg%=bL?sgdtbUEtskSHZiGa=zUZLV^GK9Dw8jh70q z_JCHQB1|jwb;{~o8?_jCdgJL!a~&WV+T6f%gUBqJ;KZxWH>q?khw1zh^f9MBpxm(vl!?`!&HhOIjEK;EftZ@PR#aSp2Xdt~u@ zzWM_c6noO2@w$IF>_-J_Z#u}g3wopF2j2Y34Pm=vLiuQJc9wS4Dy?N%*(+m{zs}Yb zl)7?*u18}{Au>qopW+Yfatq@MCL)@gRcifKSwBEh&{H&*yV*Nia@pjZR@Kqf@_MiD zD1Uh^^qfGCqTn!={-oZz$@ofc>O)w*sn*S1z?X(UQ=|D^fU+f9TgkU#)B^k~Q#$?P zUI{*mx;B?dfoY#q#YJr<{iB)jo5E<560Tf-=4oli{^^M?H#L-G_~Q>{BpL9q05fpB zTP1iU$|>`CLz&Cc2B;JIJ_i(M___P+x+G6`uyK`BB1yhUgZ?Z%EPd(K1xc9G3B#~L zQz|2)!om$zhrbg3fia8&mt|vq#U_z@@3Y2W2u6OCV>3HpA9diaW|0v3tb7RPv8j)JcM$msA_C4WGFL-4kYxmt>izo z9LK_SlL;y`9%Q!JtWHm(XH^w*R_VJwFfB5$W(f1KeCiDJQ#Z`s^i+hdYSg&g9nIvN zP)kV8^D=8;sXq#_rz~D5kn6;p7QLd?b=X2yHKHN_WozecamUtN;J(}8vmf!VOL(+0 z5P6Vnq1nR0A~_C-#c_yX{-?0tiFjaqu#sUKLsJ3UIBvK`a*?0)PyS+yitv?2wVNj2 zzFlBclO<#3krvK@sB)SQ6}g-g_GS|*B8LbC3+x16(wwBw`|`5o_E7*e>L;y)N(|2B zt(KgK_ZCP84Ps-NiTAxcX%pC!xrx(bucj7;s3caj>f6My{OG1u4lG1%@FZF3O+?^? 
zv-+`_Fgc502i{do+hK>JtU)VM|Lz}!#b`Y~?|f)Z@ziikGEy=l$daW=Qny%{E4pY5 z8|S)Q{ciHdP8uC_8;&Zw=W0i6B~{?{yGrm}TVbEDMuqQdsWbQ6;1px{!d4ufrr5@d zy_&19ZGFSx#PK&-xdU|$p^QSfff6>AKSR$xP=)%%O<7ml?yw{PJ}(8qT~NP$wJp(z zt-Va1-T8Gi__w-KrE1t#A?XlHuX6h3S2CY!m>Wo6cOeQ~=VSvTk8wyFTp9e3sD>E= zA_-sP6x5^EWZrKL1#X0yanj(OV+5>FI#&|Wo5>Ew=)OlEGyk?7{YF-pA()yu@2?VS z4cC|6;lIQ>Q_6)re`e1Dw;Rvr90ehcs(Eg99y}xVd2}v@aD*^BGNXOou%Y@RQhfA1 z>9-!p%|e-9etcnPEe_bL=wtNo{;qXA3lOlGKh0o9{ZeTwdB4uJs##jGZbz1HX7^iPoJ+hrWa16r+=u7-3jQxqk{)N$XrWolq1)@(gSRZD4>!U?C+$W$Z0+{yj6(YC)dpT0U%o}8AOLvV{u}r zqXikSQLc&hydj|f6klBBXurQ_93BrRS!a4TdI^beo50^<|7u4bqVhIF?GShT z9}DaAv<$ZUG(Fu5HD51n`GogG15HX&8X6TF!(r7|#XegGF(aNp97NiN*i z$PhUtcqSBR6zji_*MCFNV6gC7bCx36(pD)7qEbleWISZBr#m1+=YrugZfZF)etiGij}jH}pet^d@Onhz03mQHFRT3y9W4*}6Et zQYVu>_L)GUMAm7s8HbW*-Dhz_=@p2rbn`El(7KK_!%-zM+LW-rI1P7~8UWNQHVyoe zB+|n}of{$|owe_=Ik7osVF%+sXE|SI_8)QOIup}hLVW8zG91Bo6Xq zKFMs`2^H6WWLcUqZaF$GiW`S{p?(+8?FG!sv1gf3Y8aEkx1>DMDB*2 zrz9d~$aToc0sUYD6A35?C*t|omZ-`8<&yQxsm4WoetT+LX4XhOWeBh!t#Jx!4%i;a zIlgRTEzN9|e5fI$UE!#ZIsh*AKd1jP)SMKwg^B~{b3YsP?CT$xyrHbAx1_jYy$@@9 zos(A*2g&QBu}2kFq^}9d-1^H{ck%)Dhw8SOc9GeGjezZkO_Ar#tWA@P4Ly<%6o^O@ zSLJ^bwZ&dp7@M|alc9G{rqrSK_BzDbLsLuLY3ao!yZg|~&@Aw0XlSpt30$j)w#5Y;4jag2v+B~$ zT2Y!Cf8xD(-z*FXTFr?(c^l4$ZC2Ur(O)ta;FI*vu;A~}(D=mA8;A-kvTb!0-U$J6 z;z-?IHro)J)+*>nfIwY2#3Lup_g&hO84oKinRW2%S06^;5{NVEm6vM*sb$*D8ajJZm#3ZM1+7BjGFcH>WNlF|yji2NPGS zxkp_%*+(gep`{qI9hN8p_8zS#931y(eXowy(6kK&>?idpLU0zaN?Rf5l32O)DWI$F z_#2xp_{f|1^vY$8>@1bgxct;ua-H|v_Ukz+8efZr!%uFUY|gM9D&a3bu^~@lGyNfN zE!<@CK&$lMSKE}nAH~Xu5D-uPbGZhgL00qTQ{1eTC$7#1e4P2vaU2*x1iwAi)yiT0 z3W^;dKVP(VpYSp2uV!jk!fn7ibAlle>_rrYLOv0?N18nviMS#@>_NAgi1AK?8dbXl zSn8z7r}k+4H@ky3SgQkAL9{dq9!$4p}J6uIi;cLzO6f=_k?g_^Ud z#Y052>u_L9pPswqAo<jz|ZEUZVv`M;cQMX}{Z@Rz69d5&oT(OnmUZMk=AroSg% ztJU%%9&v=zyA&X$@qtA=QxV);hCO1h(0v=e(h_hH6_f{$oh-Jqx{$2Xgf%Z>Zim&J z=DUQVkF)WV^RdzMH5v}##tPmRmjrg7Brx;<>)g+c%ODW=MqQ9{m}+6;(k$8iM%tNs)} zGv>ExL&1jw>=gq9K|{~9&YdVDV!lO_!sOkHksJ*Yj9-NcKf_PrS9chzm&o3uCj7p~ 
z=sOoRstccz_>HWrIJjf%yHL*&yS%t-QhsUtsPCkUG2WkRd>j+Z%sfxK6z5StfzrxB z3XIgLV#&O;BDeWH7&~|>%C5RM>E^`7*1}tRxvaVl_Vd_lX)u@~%GIsPJ7xy_9 z8E`Fmiy3~~QI9vJX#V6y1PF!iy^T{(?&JI^JiCcvBv3NVCd`lJJ1P5R7e}x?ccY?1 zw`lF~c0ySXyq`<$Aq+6`F>6DWS+5rjGR;sLm^*W0c%;S4GF>K zZz$2t5z=_F@W-!teVNR-hzqjMRHQT+PJ38BZ7%j&dy4VvRWe6MEeXsH(mkar!%OlI zYl|O$P)BaaM2(KLJ?atg+C6mQAZ^ez)0!#C-T$>CYE)e1Zb=15;`0rXW)`o|S)=eu zIoTMxHw9YxF{7`)`{GvhgW1WW(+os7E6bqCX|Sp~A>8PN?!I0?H5>Gs-d5ez*9vkGcOGPK%^5>X zf7(Hlv>>LYl8}U1iWya(7T46AH03FJM2XNy(NAaGEInHm%6+kHRy z^$0(%;9c7zX~R`5hCI#oW4tq_jXGMRk;z{Xe7_TR*zhYsxcE2uz?s$u^&8k^Xu*{LzmX~OPR04SG zb<|^3tv26Ut!DJzYhV8(3S_1Dv8I&x=_c+&qmKf~>0<@*`%~HOgNH_`9&G#>iYoh4 z`R=fV)9UyyfRR?vFI9nLlz(BeLnDEVF|##XK#HO(a?gV(vuRmj;Dj8bsd6Pg$AB4` zfLQdR#jNHxX31$v{Npe2#4ZPXC4twW&+OhC*j!cz9~0%?aIjR{TsJBNx5{5v65TfX z`0r6bhU1}4DYbvWLifN=&&e6k-9hN(259jfnEAvm2pt}SUhaTiQu^P3e=~bdM*SBi zy9bJ7K+YOQ=LC=BmjE&(Ggeb^0LG3 z1CUS1{E>=f<`m7Msmg0R*uww|+yn6VIEW3Ufx57h(ea)sJBnWrc_bdex+rjw$utw6 z0~?T%fv_hJU#r=1g3qA}ILK{KL1v9dAS?>(WitJVKOs#p;qw!}3KO80j*!Hw(cve6 zJW?89Wec1PqBZUr{2)HlRt26D=JbbO-tc?-`!e?pYTRd}WOSp+_<@IFzZ@{|zKRFs zy>^5{@s2Wmj5^XTMktUw>hO&{5Pi?k-}6ZLPukOXuTeTQaS$5$7rgaUsr0Oi0%aM4 z4()(??}2X5y7W+Ybwgbku2KM6HU>R{tsab)k9R@f?zNqv@XBxLAdpQAIY zcI^uNcsKD`B~yFTjMz*4TCj6l|Bk}=zCQTURKA%{hkURBIn{^~)qC3iH-N3QV~*j`A><|e zC+4IwW9-?~T=fe!iK{ zfLz1Yab`{%it@;kRxOHK_sB(a5XW3f{R>+hqy?FV>41wR2;z66>v=V9ey0w-aekPe zTa3%t+i}j>+iOVvaQ#P~L~N;~=7sY7x1T9zT5B>?A4``gRttA@M~8P4_x|V>x{Mxd zo#$g@7$L2f#yoOHyz=8|UTRCR@Tq)|_m7yGBBWzJ@5?x1Xzr8Q?5QeUkKsBiUKjCN z#~F6^a6!8i_`K~;YujHG+J;FwO78Qc89`EFlxs5c#h8m%@Gf=t zzT8Je!EMe(0nPCSoW)sRM~|{aULEmDIc2=y@FqQYh_AVVuA5=o2|MILock4~45SYt zTpqNqDLUU#19ZriJTp;71Zs*_P_JK@b=nbJp{UqvQ~ci0)BS`ANnFK0M|#zRKg{x~ zC;N7yvo_v69l4KDiBdcwvsDfKx)m=W$1 zVnL$(n4}TC-*bby5bl5QW)!oW7xkxLesBdrH{8X^ZwPgeG4lfjZgT%RI(jl^8-QW< zW)jQ?BXK$bxaVR6Cq{GoO1lqms;6^k3h>$`LucQJOkJrJC zo(=O?iR}LuAVvh+F7a@M;xFMV8HAKEhCs>tqx#DNkoMz8@2*CwZtvgaSFF@sUOr2- z<~|K$@@%C*Kj?(wSBU32(c{$Z*KL4l%@0_qJ-z1|v 
zDvmu*S%L-^fgf*@VJ9C(%n!SR&o#B+lM{B;8yGR$3aj2EqdpFgJztV70tK=l1Y~y( z^GmnQXDt#YNUu)O^Tx>R@)tB?cFV@MfLq9<9x^yMOo=d)}cNJp1#fideKt*sS z{If;Ub-VhIJk3)dkwb5p?8B%5WtM?1u=WS)Ah(KK$08)bn3c7+3HWoRlJAyf3^KG6 zVC4$lTB%ez)lGjS*blJ!_#y|)rvC-_CkMAIAur+Gn3c*Wq?K`kDhLU3bYWm+kv|?j zyC$Mwkq96?D$K3p4gHpi?&a;X_wV5Bh03eSKnTTk zRc#R7NGFMT_KX^9i*NtaP$g`U=;!D;DLVRA4x)(6Ww_f_kf>+DPBb_P9%70k?tL3n zS%=lu3VyT*+d(bV1Kw%L6KFv=1_^Pazo`_>aM$~}r2EK13a>eP4rFB0=VU#O-!xz2 z7G2|pU*oPGL`*+KIQvWe7TKy3`9cgEf(2GzhI;J$F*%zj{}=cGGqECDgLD0*0do+F z85gMTEH+-NY5l?Th9)UllEZL;0yKQuoS78C&}Mz@?0N(Mw4;9SukuRDg+Vq^8qxFm zE=Kdrc7t2&3-Rh{#5CMj=Rhf5u`O<~Ee^5VWePV^>d9B%4p8LMQ3i0fIbH&}WG_Q} zufEa9{1*HlAQ|;V42qo4A{@W1=>r!jF$1CyBlU<;xu8SqAw6yV(WMS|l3YvDUnRX*d#dkCrzoh;N(W zU$4S_g{??8Y*9Y<*uAU^?7PEOL^2vS^j}N$zf1Q54D8`0rM|F43_T*k-UgvU6f3Xd zV6VuS$cj9ka5xFr-w$#63uMiGSdF)j4)4~{k7P+dYl(7n1pg=6&7q zfQQrJmGa0}ybdmfNo+kq+T#G)QR4SG#3lbQ&xGi!TXWS1Lcur$2uv0y$0n z6Bef_FwX(d{b7Dh4tk_22Hi3{{X8o=2BE}12Y|SAD~?y#Q664j`cIU0?^K#9I#LH3 zia1>+6HG~t?0JIE2iZsmT(jR9-v+`aG2Jl;HHsd+2g0m1Kh$9iy0imwItTAPRVqKv zr$ayNgPbm3m}3@(YyJy9eX3M_e&NwJ)#rIs=rIfrt5gKDpkV>$pd6HnXZA9bDd=u? 
z5HVGG=Mxa8lL|ff&>^lhgG2nmge2{6oqCE2?fkq*RXjok76-08zwk(>S|@B{7Mo^~ zPe}KGMF5z@&g?aUs3-A=H=Xb;?j0ihrk}AEG7xVD;a^e_?EkUjl)UO6;9n}ptFZrt zc|Udph@g59j9eHXXCThC^TAVUefwLwhp2~pi=U4}+CtOtp3ltGzNXd)N6+llMRwrZ zpfc=i6!~_IjxtRb7SkmZ407|O=%-ik-z8&^BeVUp0e;Ik+VGx5#ho}Wy$o4Z9d>QU zh$R7hSGnlgw<`1my`IWmrLpO^9}3K5xhBy0YGV5i@QRw0VG}jJ!g`a1erUT8Urf zgv=ApFV1UAJx5WHp)&DxvY->u|Lp^ksI8735%wdyUyV`P`={*P?5-=G)5~(V$V%85 z33}WJl2l5Y1jBp>4=An=Q5)tvgw$Sq2R8RV-_eN}Ui?cF@wO9Dnmc1P1~s+A$5;#g z*I5Xv1ox|z7}VHDUp>U}{U9?@Ryh3+9QC@Sjcq(q24d&+&Maa$cpqphn5D(qzDv(jHNyx%?I$xT3pYq79%*Z62zFv znEBd?pbTsA)Z0~c(7<#K1}ijWwqY)gwU6C~o{|+vV9j*|e&kFt=_I3F-2)~_3 z5W!h6auRR`e%~?vM$EFLiL*se+e>w1gT%L=GOoAs7&bm@-gXDrz)xX(13I5=FAc05 zZ_NX_X^0W+2D;+oD6KP@dH{$SkfTFhOq!dMW;a$j$0faUg)!`12Z@b`!Rsrq_tUOZmubI^;>RC4p zBmQ``fC9WT8dQ;m3k@|giJ;u7#wrf@$1g7)j7b1KIv!4jA#f7iG#!Zp3xPNTu1_8= z53l0@i9-e9rP_xm#=ls&L3Gnpq<__ZDiX)57hhKhqc<-R_?1hVXN)WC@A)!M$)o1f zad+&f$O%O6Z0w(il-LS_^9Lb_!Sh_ZR0mnF5Y79}Rv*FvA7&C|KHYciEBy8A1IeQ` zV}m)cuO(i*N+yB|%7MAU7~HgF;2??VN0bzLM5PdTdYeGI_Z>T!9Q=q%@$j%kg7uS7 zQZnGJZ34aD!up4&8wmm%n7Jugjxb#Q$J0Tm=+YEj=i_(iO(RrWLO0X8(%yGetJk|fV_Cgn_YQm&Hn)Y75P(Op@Wz3 zx*Y6*(v_;D^4I~WrI0RV5-J7HS^b+2llL;g~0 zm{obPKm6i|3s>gxy%TMS^kke7!|3?5Jt9b|M-P)F0){}V*Bx6(RebQQ@o-7va8Y<2 z@6k=&5cJaFlKvx0LH`B#A6dfu7l!5SqX4`m5}5A!upMc`>PgdWo`()k*y$*q6{laBA+4`E!3oSZN`XWef#X$gKRj{0cPL1T(kNFWj!Ulq zkneuEbUFi~_5*ND)!qn}r3ON^rAwU zx0k-r^8adF`*-N<^gXZ+&9Umi7+JFc?T!awltMgjm`eK_f7Nyc?m1^Oy~5&$P-fZ< z7dgf_?+q1-Uh46d4>;GH->8C@;}M43!??HCiyFoQQSY#dLAnfU>R=w&vFqYKQdhe% zdrXIGkcV5Q4je@{O+%{Uhj&zgTZWZRu%;voX6lZx@DE`8(v(D4K=~IYLpK%7fcuEz zTnNSaV=)x*{n=s!RgW9KZht>)sxiWX+8C&SX4X%R`zq!KX8osz>vv*T*W5hx~xm65wZ8&*u7N zYoGoX;NRBP0E-mAgxBRD4~H{xlr))xj5O*DPT0ei2|mjUEI#ya3Q?_kI^s1*=q#IO z@$r8)#+8XF;0nC+V#}Lx>I$}tYOZdA3HvTm`o&x01i>;{c$a~Dqt7Dq3|Yq^94CCWO`cU z;Y&)^MWE-~XV2%4{WFQTbIZ9EN4$x}6~rk+9`qkP!>s?lC$`t0`S1nrhqq&!$l*|A zMy;O`dJ{IT4jJ^w=>!j??eCxSsiCZoZ-zXi`P!Dv~7Fu-K+*|>r z|7kCJzYaMsIs*pW-L2++0ONe5l^vFRLjJPM)A|GiF@y&^JzU&owLiP?$s8@T#e!=8 
zKy^Nk+Jn9|w?pn$`w5|Yae^vns?q{_)o8Jv&o{8U0K!_EX$hpm6F*UZHlVf?jkT^y z^br@mJ_`2uBj>*OlY{Mjyx(Zi+aJjffbfKGK51p6-^qXA&KU0ST}RQEgdn=cq?65D zxmJ!pqTB=UWW_m~@a(Dw7)TB+L&9{w5312Rod&8iRFEa8;;B%`A!)1XdG(U#x>jQ5 zT=@1dka~I$@>>f++6xwkc4`Kkj>&iXNW^*I6&IQ0otLI7Y7Dedf06s~}IQf0<`YdfwzgNb4Jde^9-1 zzr8M&k3NsWhDTGOG2@c}Q%~wd4UOSE0U{7@84hpWgyC?p2*3xw-k+Cn+-sus!Xq(b zfivP&c4u36B19vK?eXP8^MO6BL5p-tVF6;%c`xoJjUcHh5*Z~6?HjnF87R>EssgdS z$9~!3;4hh%g?^F)wiEs_aKR@o;CzRJ|F?&a-D}%n65!Jt1xwDMQy+3MEID30^R5C& z*Vzxk)QsG5$5C?9sadX?a9u`7G}^|Ax0*)y`4ZQj9p_Z(ICo7)2noo66VhV(@M6S< zHS9Onu&u&aRDqT+p<VqwVzwj3QG$s?{iG& zgspUuh7c!|wg}(t1qCm@FNo$}xpXd?@HUR{TVgC7%ra(69rT5Ex@ zuH?;$@gKZh^CJ$~3M~4nQUrK0zQ~UG;`el|d>2h_5MNG_e}s?#qrTAFsVQ7SQ#=cs z?CfiZ_IeAfVyVqWFGr0t81C>q7O6W?+!RwGt@VX}GX4|7ZJRxOV*!&^2hc6_%~ zzs4x!@V*LNlu6m|@BfE+l}Pd1Il>eF9|q5&G-xdZ$PF}vR!ys`a?ZM8q&fzA?L z-Ft9{r}8s98E_nLtv7}Hs_=Vds+Q}PYV>zXM1dds=&6+|Z6Tr`HvMsej7EO4y5XkC z&Vb4@#(m_(f`?gCq&i#=y$Aef5@fFA7(LK!byyAp+w+&HkPbvcs)BEcI&zFZS<5XsX=)mMPoPBG|OSAM#8?nlj zPq8+gpCXN!6sJ0OdO~;jT>!{*?r`!rCbhbqj`9rWoMxEhRt^GicfAid_b7(6W7LL~ zetv3xL6uW2bjSYwYrI6G9#l^go;%s$ z50Sx18Y&nH@7_OL)?BTpvv58sxKr-ltG!<|+v0l{pTv|FrF}Y}iH+#)@FG|65JgOk zfL%WLDU7u{gSudo3tC{G+vrgmBUq?9yu@{Ntglk)Y|#^d2ZS#Q3|5Kl7Q%rWM&c+& zuPt@IMjtuDtNRc%?`e+Hx9j$*y~Cx8=(5GVz(<2Yaro>R)-yQ>qb4ucL%zpaTG7MO z9iMDo)aewXbG8#M*$~AwN>zbYy$_>KtgCL4E^JZme|T`lllZ@kwo&R;v);9RFSu?m zj7%#Pz>5#LOorXVI*!YN_H$&@8uthT+9I~T ztC>yr_IJt}RlM90MydiXA~mTJ-&foL&tPc_KDdf+;VB}?57&%F#u+|jm4JOp5kEJY zTm1bmKRpQmxv^z#WqLWUfA_EuJTXYb^3zqlm!^A3mKDlKwy*sg6Y+^7qb)+CwT7GF z_Qk;f3~UCYcP{;9!S{uCCan+S<^m8EWd`;4VdY;)ismbDMbRy<2NF0r;N<3gZMg|8 zQImJCzwwuKcH%IF!RrR&kRw^HkZ2_Q;EzIF7x%bBuEsE%J0Vx`yG2#oN#(em<^Hou z^oN_{!?eejJ>>oqE#D1oo|;ddl;?nwy4tS9FUZPmAGMHN3G66t@+m!xRrgVa&zbu$ zrw?;OT+iu4eNp#8NCF#lQqdh~Z&^w8c#1X%29Wl)N99AjkmJW3CHq+#B?eUa>^EjF zW(2*oZ)Zi#nRaWkr)cxeOoYZ1OrG8-m_XjZ=ne~`|32r3E;Ql?QC>+xclNqLokN>C z92Zs@Na|G-D3sveZ&lj1q}!vnQ)hYA{k_exAcAo3WtwKSYl=Q4=1;E+Npf<0BAY;)iYme_+_p`&>WdC`z0J 
z>Fo^rA3U5qaE391HOFujY{tKhC>q(UKYw&srT$i>T_B0Nj_SC3%TD{_f!TtP!-q~# zyF7&@(tzy2hF?ndtpi2WK&8qfip4^64^eT!DtM+K1P|`r2&YXodvz6DigskcEM6_K zuuNUXw!hw1jg2usGMBZvPH2p_9W*^ezYA{j!|QkFQtyji*-1)%Ar?BMHe>bdEblJ) z%u(i4CfTbo>X&rO92xR? znZIYTt(jYK)qhD-`kJ-jH(09T%ymYK1hqDBW%}Z$wFA>H7Wrvah%IU-Ep|3`Z5}@g z>1uVJaQESS!t^(X|A0{B?nddy@vW)h0F7VQnNhzk1(#TTGqzD5i6Mn@B~LPTKQxh= z$vVOpNFw*TWwVT)@kfu0F;077L!-zA*4w^QJZnkTocTPstLmoO*L>^J)EZy$tZPkQ zbB+}3)7+|SQqHoxP4y$0|4#H3T>(;8w;1|UQ60p)_*N<9XYrlgB~NMU=O@=f!@en> zv^dl+c-t-NSI~!LAYG8!Ujwj}-1FOkufH3(4DiFxJJR6*(y!Vn4riH(1OhfzNG|Q> zIVG4FONqTp1QJ=!rFEF7Xi%UZ@ho09A)#k55DTF-2Yvlj$OHh}5`7WFQqI{#_7%wB z{Kh!Ab!MD(`>LHP-%k5vu&JM?A1(YF=uLcrx}|{Qry)xq%&$kiag>q9>J~jc7!L% z4R&#daYWglNSQH(o0tp^ZQhFDi|w*w*+A~=-*s5pFu$(Ve5b&oDWoY+mbPr8j1VPk zrMWI$7(RQXG!kM%!e*h?IRv37t~(<$J1)>{ufZ&6)`7KuLX9q(7NX9ZxB zx$z!zbO`tvLsWy^m~2T^^uoz!5`=?7w&b^6iIk+_xmE;bell*qyA-$3o-QbLm+M4@ zKO6Dc<)iMk2|)nTHV zu*+ya91~AHgR&F;&Pp{!viMP^@_Viu?x#aU-X-fys<#e%?xr|wN95@iVHv~bnETDI z3hga@J#pYikWI!yo4SAAnpRY*yisJjMIIsixs9aAOGQ-lol>{xJFKM28+4;D@1M2H zudKL&G^RTI^N4|QPEpfx)o4DB9T;u(i|_&pU#juN1Gf)*z=fa z=L!x+aqA{`3!g(lajUKDgRdZnX$5FjQRk2^N(I#um9epx_{&tSsHB+6`kyCR6PGLd z6Q}i}HFP)zHdf@55YO}WHMn4FNeZ~*_ulyv_ronD6gvf6TV#0^KbcbwWRDPVY# zbS@!6V5d5SupmzqPeYM-qBrsz!NvCO67{-lWWXkpCSU_y*9lLtR050sYRrv2M>e2{ z$*Bqjl}u?qOjxj-rV*|}lFP8kcW`rNxv|gfG|0iB#1XGehs6KM0|g&3pgCEu)79zS z&y#bv6wbf7TdCTo5gvN9pBP_!{Q>?+u)`xy(DZ*>#!qQ@!*dKbw&2>{+Ev9nG&_gI z{TN-Q^|zXah>KEW80Li>4S=n`1@^$$Jj3vyP{t1fy*{MG&k@t6uMVO~>&a0fAV5=IlhOVgOQo$yPwJO9 zNVNF~jor0}_Rf{HJ)Nh;u|vm!`%k)}egU6q&%j0C=SuLiD0>81liAd*O`vS6++7NT z0f1T>&6vRM4xTi>Tt-a;Q4vj!ik%akIQU{<7%MSv?yFu{V3>InQO`ZHBn@9X71?A~ zX8_WWo6rw`e0H&n3!rQCo(nZ!>7cuRW_DiD4vIiwhE%PYHOG+wFDK}N=>|Pc@H4A5 z)fDS(DEB(&54oKx!$xe_@3Pak1}rw;2qRP0ca2G-wa*CHk)`hIFu)5=;-bFt5X$sM zrQnwkoN!}G^zZEOH=fYGIrP)J;mFW0-#BQ5IZPaZkx}o-2{&N+^RHESpZ`<YF0j+2{~76$EuI`pMRsA0*iK>f##wL?f6eSu z2}i-1AEMenS?y^4cQ%-{nOJ84X}rtrN|>IlAJ@T?zOoBx@G;!;#~0*%Q)?}Y5B40Q 
zSVnteK@>*%I)3GZ%9{@(%Xf_Hj+}nv=7+Iy{#zEG{Z0_69wXym$qcgZ>=!w&#`ZhG z5V;A83!w1+o+-T+a<=-5Ir(4x)**}M{gv%MGZNGzGe`YY86morbW~@{%4Xw|z)LSx3?{TX;ev|whYic#5=9Im@ z)P2h|5E@D&Auh!y*G~5Oy9tKFDxA}mfkl64?^<9_V9s}h+R-GCAf*vZ{6umqF1DMc z#S?|K5qA-BEvC$PjDFM7J_h!hmU~j??mdj4@r8-g^ObZMd+C46*7e4E+x1BALSFny zZe`{xeo~&K`J}clpS}6k=8Rwp@zEA*ZIXiCH}>+Vo_uajnv-e$K0xeO#X-xEOxG0m zzqw}SdP#(OEr>_|sSMh132jKsobWu_{$KaFn>fN@l|kg!Xsu6d^k0l^7+kvriyEzn zC?tgne)$n_WxZ*~3J^eiJt#$csZTwm5*Xz_kK+(gND(h?IZGm%;>i$=qV36#T(Rh? zHcK8Dy{_gd#At~0t7le{hx!Uu(*+Vw?FAl~OE1KS!12jr(N^rn_P_T3^&HM%8h@cphu)AQA?C}G##MF#nILhPr1I#{&;RN+vkf$1h;=87+W6Mqdmq?6R|qq{5yNIedD7gYbZ_G5*c%Xs%njNHMC3v70R`c+;hCS`D|D z;YX|G7?o_4qD{q3rgEl+c5AHis4``4QCk8gJMCD>&A%nAR=?bkGfl~@*yYOkLb#8D zgYeng@l)^jX2FKyo_E$L$}qE{UC-=Oe-KglgNNt$rJ{oP?OWqv)Xq|_uZDHDhL6a) zz3~u2O6i6>BFr$rL#xGZ)2qJ86Dwp+02*R)^xgs6sqYh- z?I<&-KH*Z$b0^f0$=TV>XGr(hK+Q3i5m~Gt0!0e;?@i(?OE3g`FGH z$}qQ?7xIQ&3>ohoNe=)iwQ73KTX?j=b4tJiAxXKwHS^nPT4J~U=^lxPJ}~o{9rlzO z(X)j0l-;2uA9-S``O~lb!(GNp4^VZm}sQJ<@aUgb}jRj=jzYyaXt z&O+#Bdm+7m|Dj4Im{s|dhC!p3xzNq#^|*M-?iadI)*`Ku9R-$oq@5XLU=oaM_DAsy z+6(8(so+&{?hq4Kst6AJ>;Z^r4sJbBqDdY(5;RNAJ6J*9R0!YvF3DE%xVPq+nPD>6)D&S}pDFiY4+P*+=B#Ob|Zw|Vv=k;e%{@|%5ZIG z`p{v~zjJ|9{dyh2^W4kml>Dv?BJr>TNm3&@nt2E&eae}BTYErlcD-}xwDYzaJ7Xt` zl~N;4Ko4zfbt=EXTD||X8)P1`0?G^IaLvz5N8Trq_W!xH?Zv8ztf6%d0$i%Z;?31I_bs+Hn?Hn7&n+mey8zyiqb%mDlCn4cv8h)-V=NdnRME-Q5)3;gi;D-Zk@ zCsSv|idsc=S!7P3`%?HPIGhsZ?}@w<0I~$?fL^XUGhD8)L(9!Rg%pnOfCiN0Anm{u zp$(D+^MT43d1O{WYi(<_T36SWtZ%jMg4n0G{m+g$;66Mil(ua66L8?IePzs_G&;PW zTi-q_uDXc{$EGfJsqD#-BWaNqFwdbg&i|#|ZVE?R#4j=2(KgI-UG|&T40T1s-;mYQgwo>nveakwy2&PC| zBO#2!39B{GUliZND(mFFL8wmd@T#RPR>JsAeMa2)9rJT&9TPEy*Ss|ckt`9PKNi)@ zGhODV#zCm$LG9_K!XNddC&2XHfXej7pzeRQBQ%>I&z}T(A~)!x+0@TtQhvVjxsA3=JT@ zEdIn7H+m3F9QEO!#5fg?_S1Kj}QvtXh)E!Hn&ctarTi}8^Sf{vx-|uVq8u)bDjCL!GNH5h;t@}{R1DIdNM}o#Ov-Vl9J6`XV~x0Q@K=LYoGY$X6`i>>1FUAg zVZXg;3}}&qCpaeQLC978B(aiIYPU#qrU2$SiSg9^9|#avGx2-Okv{N z+r*cy&lgChZAOA?mf`tIhtnoiuSi>sF~g|eoUcPrCh2l%A?w7kwHZy4?g$L<&vIWe 
z`qb;JU|!H&lC1)ai7`H#-tjWRUhoQC$f&-*U^Sj_gNatjWi%QHuAR;K+jPpCSab8$ zVL3FNs%!zV`%OmyV>%5xoQFJbq_q+?4lO|G6osim>l7``<1=TWgW_C8vK!+VCoYPf z8zT!B51Vcau^9I5jPpz{&YqgrCwZcx<1djWDjM_#561J5vX;Vi!li<7GGBJJf>u6s}AH)RpHAz}y> zruSxT8`)4;>NHuRMW9XEz-fSL*|X--j%TF&?On_x6w7AIOfHpv=EMm@2800qKQ4{t znS1FNyAIvjuFFd3oGGqcs;y&tmQ&{6pEms`vlRf1E6=EzuXBOzV6t&F6ZvMT`$fC% z_bzjZCkx)OJaD*=>kI~wf?~WjPEat`?7r8XENU4U{PIFGwq$kW?M!Zs<9J!0RE2X4 zR!~Qh8VWgtf9OSUI=4!gP0ZY^NJ}tcmnDurCLU_dHb5BZ6(Rj*NC3B+@8_E=<%x!r z8G8oDYt!E^bi)`LC@I_1Kl&!*l)fVTb&Z;D*HCRMiG@~ItZClGRu^ZIvK^&1jUyJs zNUoihfo_s=7!XoMaTOeZkg^Tv6ocTTjSZ`!-=wsabNG8_UE&>$tum*&)s5nh##UVX zEABCYYc?tQ=r$?;KiOYxs|X&@mU*~3@ND3M$01!RgNeSZU{o@REfawdryLl@SSpz( z#oza1duXVZGemPvfWw3@Lff0*#a@-{n<%gK^uGr?;;C4i!P_(wThY@M7%rZ`oXj$@ zim^7rB?5Sq-o9mDfN_<1n4=3uDCPQAJN{Kfi53q$5$fxI7x(%9RopuXuG$a*W+cc| zOi^&L>ZJyRoh_;FfOJ|_*{+@xY4>HWaO)VMFh;GtC@}m!K63p4Y{1pG*R?5<(5U&J zO-tyXqz2*c!y1kW{cd@(A(mp;Kidn^!dazsS=UZ8SD0$lu>?bg41eJ;BkIc>-txi6 zTNlz_eHbX@_GdsC2^Lw1hzxnei=3k4bp5%%gnR>uL^6`bPxv53kakVI5G~S{PWUon zaY+){TvY`v|TkfLDan?Hj{(h{Da1}kgjL#Np>{Ww!mPoP2Ku{(YGCTJo zA^nuK(aH9NM0ValLsYdmU-kjTix`MSYY&t6}9Ur9EDF)Tp9j<-2yN=9Bc;E91u$Hx*Pk=FOuZ#yhjcryDRzblreWJ+DT!bk2 zDzq1qd1Xl`J04*Td^KcQ#@XFiK(~R+o#0aAQMbz8m&~ENl54TNb&sI>tABP&ya(PPYND5iS4QP2X}IO z0V1r&TK{fueM;-(erR-c7wv|3IsvU3qldOMIZq_s;bKSGwOKL`BR<9ye&ipmc&c5+ zr7yDzQ_bU!ZEztjON{0iodGZay;Ru+Y`iu>2EJ4B0FC|bhuM9o<=vg-r**5mJ4jJ^u>tuBK%zCKGB97Ie^@1 z+SB{6G`b^t_rvm!JKKUYd|Vn;^(0veJKKN`t4V6amm0>Uc;X_uf=W^I>SWZ7Gs#Vdmi>Q5JOG^i8(#!WA4Oy*z; ziedsw16nqpZ9HRZZf@+X7Gpi98SIQ5Dae@gh=9M+X7j#r9nq^HDlJUCY+!vFCGiyO zd4fm;ecWsj?B<>KowRL+X#AqS;e!fiT#J7Q@qlEs8gKC z(s=w!a`>dSxPV4sj4MEKyMI*|U~(X<#KC~l-oV*qdEyEck6oY&EzQObdx-dnTjq8% zO2iVEp~DF+eIJ2Q3G3sWM4G2oKW}iu$3pzmN{5)|iRVDIL&7gL4Hs|6)cofdOIyn8 z#E*5qotQ$-)8%hqzXL7IP^{kPsj1ae$y9 zHTEAc2aW)>1~)n!;27og>WWlq2TOU>*fj{7$^U3y1kpFCy_bmQgpxf}+%EFAZO*yU zr{|FCjLChSR6W+$S>~W$YnWgO?y0ZdoU`3Qq;`aTod)}aE*we`Q==Kv!X>k{vCD61 z_tS08ejoy?m?X7h6G&buqqcPec-)2DCH(C6+UxPPURN7YxLpCRoW!+gG7KMhB)L4y 
zRD|>OE^}DU!_xBHds(Q)T9@97@$P9dA|5z0@I9GbRxaI-R=;CKN5J#pkC|g&d+K{* zD$qetl`f%p!TQnnPK87wb+XAVLM(qrsQELlEMPW%tuG`DUdNh@PBZ^v0N$tOozZ1IHwuvOIR(-pP<42E;58l8m+ne<X0EyBY5Pu(#TuF@5K_A3hPz{%Xx^Z2LgznhHlK}N-d z<)2WcLesCmefr&B>?)*X%jaWatW^JPXU;vh`^%hnON!hZDHi0=kVgjS-fqfqNAyfW z*%zAgyn=|TN#HOSLDPgZsAWr3T&j1Dm6!$t>_nKhw$q;!-zS~ zGDC%sykBR*7o>fq?F%r?eq*(+B%=+POM=DP3J-{7Oqhb-D>cK@5Wg_0*G+2Q9h_u<2egUXTj@9CF}AeK zEz;5krS+;2P>wg5P6QU{q>Z7HkK#{u1UZo*MxBr4Hy)Y3G1ZV*D@ER6iXSj0%>@>& z10Y-M_qmzii-6aDY?sK!nr(SL7`&H`)V`H4FRiO#v*52{v*X-m(C!RG0og(EnuSoE zxO?@7Rtjai`(8z7%gFoFr$BAh#%O?y1^7oTjWkdn#win<1B98@i)`4z!&|v50DTiA zKESqw62^H_qEWtA0oaz>r-i~AR2f(p0RIc6#bT!U0k%5wPuo;!@?0fHWiF=j-2Lso z<=dS^oh$#xIIyay1h^%inLZ<(tWKG?PfE zt+61u7>!x~nY2<+59b6hi%CL-In(;mlP-}Vl352ym#vCi=6NtOW__=^hnRY8fVu|# zq*=}N=?j1$S#WbLU^N_m8(0QH$q*J=IH9ydr%~BW@hA@b6#ZoxD&*^~qV(q}TNN2; z!=8#l|6jRPt>OKG9D{u1mS?w!QV#$>eZGzzeFN;5bf4=(`DGGm|_1BLy^OzgfU&6$q5G>FsrgpY43 zH5zwD#^`l(XUwXh8cwd+)FQVhASkB+ohX5WJ-+>*-c&049$E%9#eEjY?NHj~i)4+M zaT$so0Kk zRx9eY9m+s}9%=bCBU~bF6h!MCN&Q@*F;Mr02=F%->_-_ro!wl>Q=9Vg1N{}CjJB5# zU*anv43!<13=)8I>PG^#+!S-G=;F~~_P*(+m7(9f8TW;$+305bsP`SUPllq_%L!I? 
zrAgskrs{5>oZCc;Jr`7Q8||E_4zvS&L!*;O6r+?V@39?9AiVFN7RDVSvyKCbj1fqQ zq1r`b|42cNl?~!|?ptD^pH|nM3rcfsX!q`kvb&rGCqC(r+kI64R50#`X@z|*)f!W* z$7I2EEqpkrOLMNV6ekZZCyH$N8Id)$_#JrfG^q1g8iQn9GJ=Zvb0FbY*sQaig}Rw9 zQPwuzT7=akOU@#sN<$YU0{~_HnOhdw)Hii$^mKF-) z(ixeegb#=PHNTBy4j$CnWKvH&lFt~+65as7G%9r^szv_7e~>BQyM1~T1D^j2n~<|l zYJfOw1nw8cs;X=?M;(yAgziDBikhN03)@j-49v_b|i!3D4r&`yAI7^1Cq zj^ic@7MLEw7<6Op7-QL6)#rGDNd!O#rU2-`k&xrI6>4Q!wLby!Lw=4=gFifR$FAoO zapoIldx6_>?#J&}{kPjN27P|t&-=Z&&s;+F-Zl^8 zwH!Z$&!K;};7_Ap zTY_)^>1MWj0%*YL1>{-R3h5!dl?4iyk^9k7TymF?Q2aSwX$PpiA0Sg{iB>a;IAdvr z9(4HMdNjVm2z3ox?wC=2I@LXRaIobLk`m9>A7VXDX!LP4(9-( z8CNv8znzT%$1TKkBcq9j(DENYOY-xz-cI(b^F{{de*tA48ZBmnX7{q^vG#f`IH* z;Ro=23k=;{m0WB=DN#1i)2yH`Yhca}S%!GfR2_Vu<>wmF4gNs8W0jhH)C#>->7T5Qtc@uN zpDITaHpAG@^26Vmq-{^<%+|htsPxMCuPbLyx|*IvxC7BI3??|zMS!Q-joh*o`(`*; z=Eg>=IrwV>o_=g40A2>oQnQ;*ifwKBt7tkHXyajnGE>6)H}qNfP-*p6djJ~xtPQiw zX#f~2;r$(L(#>SPf<;37Ri)v1WtLNffFi~M-F6mkZ^0CT09qC)Sdl571+ska7!@4G z6gbwnv?fekcamZNpU8I}dBQM@D3FKZ0m>0e$GqVt+13rv$OcQxmX6Cn)$O+d)0?bX zF-d@j;o&o?>|!*hLw}$GQ&x}-jiQzOv3XKfm0jeQs92!XvZaKUdgA`Oji<&&-RNSg z#3&ix+}pi5po#r>cz#}s2i%4UBkiijWSFyeJVTT}J zMQLXq8nF%13j`N5cf?WD5?B*cIijO!0SXGk#=4F=Z?L*6;^Ib#55wQfm+7tQa7%VSacLU~?R+BI}j zzl^_<5(O{eW!rRD2CZ?H?tcBFDnE8fNrU*UhB-8=#8{+Ey3Ck{Q}0h(hw-PayZJ$X ziIYztjQ{1YjKYY$s4$#Qmmk~tU(I)m2;lSV752NX-p*ylZBcVEw0gSRwmq?ZnKnEd z8%(o^e9(MWUA91K$ybCL(Nf^Fw0j3gT&x7Nw9UH1qmJ3Lk+m%YP4SI&tI+N6+`pLm zfq5(|*pv0rb_qsf%&?oCgen26BZMkLIlui_&Jv}zGC}94kHpa&jILykrcG=mGfFlz zL1(Ft#MU&$(j>Puc9y=7YNxP|)cC(ddH>huZnp2L5PAMID!d)R}?R1=q#S39M|#m~stCOZqNAxCFs zH;i%tVQojK7lNlI@Zq6mwyJ8AIaay`-?Z|}I>1U_V0Q>Q&^Qs5oqvDSu}yFA$p~oB zhtR%V`4u}ji_++E8a=a>ULqKMpDq2h?E@Bqcj+6Gi})eqD3?eOSv`16QgMU6EXfzp zQ2j-zmPsfwXNPNtQ`_yxH8_g<0|l7QBkg)0#m@U@SPlpsEr;zt*9rgMgr;uqoRHeY zb8YeWHtBKj?>;nrOMA8WFVc>^{;x#&G6d3k4j`cQ+U8b-&uE<#Mai~^lF9lf1+UG82c%Qa@cM_^&8?q3r|4~W-&(;(|>74(z4 zjC~89Kn&wH;I2H;GHj`gR1oW+9y?L;X=%nqc0vq><+TkMg*3hHBFfe16FqYey3;Us z5m^r1YG_0WdzYrRO>tQ&fFWvN2E9|DnqsuaHgV77gNjw{ 
z>~6*v|G-pqxQ<0YRgC;f1eb(~-cjiklY{_kKA@KLfBZf;BPc#)01Oa*V9$gts3LoKHRZjO`tv+f-!bhP zfvM0UV;1Wyg-0>%Qv0mR=!#oK(REB~U$Bi8_ASdad+Y1~9b-HSVlQ)tg+`0G zT%#Lg@IOqP3n{S$^-M_zj*aGj&6KfK*w_328NMin(&WKCb8y9C2U|>!Q^FBaAQ569 z-(M;?Am6Y+V8KY{bne&hFAHly7Nbv({jyT{H}|m5F?n1e#}VH zycV72yAJ5hL0xbZn9GeM&OX`L_R)Y)5&VQw7i~q9@^Ark73Y5&FplH@=qiQkX;6XKbHB2H!hA|KNQb$nE(7h>5lt+K6%#f`nV4JBJlYUMV_^Sk)kpIGrIfIU5deE+d`DLvQeSX-Vu@sm32z>ha75*&;MJIoi56mPhO+(I+tU zU?V%<0$feaxv0r;qAl-LiVD*k^Og6!RCZ*3yboZ`pRtf9nO(J`@4!>6BrbGr>! z;S!XCMQ{R;Z~fl^OH7^P=SJak8aD!T{}PKkn(yc2zmk0m%KlRp|DCg7V=w<>W?slS zar5Az*Te>(wNzUIhj5n3W1KTZR}Uyr`+m@Fp(fb&=t!_Xebx7IP517bNKB6Q>#U#otBkkd<^u)oZ6fb;7jm&fv$xeY!hiwGBI3xv=PJZ5=Rk zQp`aX8iAA3FNFwgII#@CKvQ5O0&hJtIQ5?ync`DQ|6~J^glp)z|AXSN(cZT}^mL;h zTr&VwZu!35kkHlJf4l2{)bid%sed=`r8Vgaw!R#wqm*XT34g1WZ9{QqgAQ_*|e&?}9iOg$?*U=|Jf>7#maV0lYnpkE|aAeJ zxcI43jeF(p1{Em_?Al-ZG;is!fyr8+EnKy2Sm-!y{*(pyf64+wX&R2688=9re-VSH zp}tnRILQ8=s=1W_@bOjKMd&qvZP}P zi1nT3l@1zyo|Py)lim+QH#8=daXY|(NFyfJA}eB1Uv-bR^V@VtmV&aVDT?8>q!+^p zA*q^Ow?zdoJF&yq5Fucx2B>xuX$T!U*yTeTFf*wb*r8!vm2fD4>~8)uK+8LlM|p)H zNByTZSWpbFXri=Kq`=$L?S?&IbFa2*jRfP)gO}MQ z1#l&`yG+X+um+pWIIk^iA2FU=`k*dmuvD8hX_+%du$bpE-f7J#cQj3S!Y)`P4$LqF z(`b?}+vnLGkVYEEo?yNiHJDM;c!&;918{*&BBHLfyk|Ch|BvaE22-5=ml_E7nqOpc zIPKQ$OXg;y{E)S+WnTJ;Ef)R*GTM`!{9p4rIer}jN>ZST)E%$K1BLGh=b!eM+9_ni zF|?02=uKIAlrbKlYg!L><3lzW#>S>>`FOIL<|(T`l&U`7dbx)2O?d5Slu}3M$&U#^ z386s{!USUNrz|87)|~WYvZ=OhdzE@ID@8^uv@v5t?gjK~=UeI6+OM?2V#T}{DZt;5 z9S5y@nTL7lG;*Lq=m3G$E4(ggqRy=yrWPVf>fL@|!Zu_o zTy!X8dPV&p`(!51szwpdRq-`$!Wr4i#^l1h@sc*Uw*6|6o{|5H_G{#TDr4l>n8wa8 zxwr)-%?WD4!Ip=r4B2GHF2ZshbAKhN3LR#OPP&&6`S2ccADw~pSN zl%0z}K$5)%2%zz~pCEt$%p}AfYLH5o0u7GitSGdMG`^*=pQ$xenAAMu`@lH0;dA-t zoJ-WQikqp4$RNTG2WR=GcL&z77`DyPX|O&kh-eOF+<0secT4*f0h9!7fKi{+)^-(3 zkd-uOp(2-t_g_-ACoMJH zyKY;*0poZXJQd=!eU-M*CasgUUW!?gn}>w`9~G*(SWCQG^LgmIT+2Kp-oO+4%x{_N@!0+)-P^Wvi(NFY2LE z9_Gndpjy-tPmP$SGV=>y3pWJ;2TelcLxy)!Z{4uo0VP#V7s|~^;JKFUYBS*!1UGe{ zvG9^wvM5&iigPs&uMz!n1?W#i;O{|>ZAg^)=3mFZiq?(qr5g(v&Z%0fx^;io9sePD 
zlGWG`vU^1FM&oQbNJXoHw;rdurO~t>m3~l8#thm2synbyDA)i3;OZjrdr#^yqj#Ab zj0L&))TB+>YE)@nOU7~w@hGFKPZv;_7N?UGrIupYVKV7Sng*~tt4ba?{9HALY9QKc z8r8GwPhQ^0e&_nVKP_#YR8X1>wjGi+&S1AZ$Uf+EP4dCRQNI>o9^LwvA5`cWJ^XG} zJmq29^Af}?iqzuhAAQv)*C z3JDFp(OGBdn3+wT4{T-}H4I*(#v!E#0?ra9U>EoeSJ0Js4J$ zxlC_vhS1EwfaRA{wZ(e%2zC*s!IaL|ANI7~Xk$zD?vw|hJ+leyj}xxN7t4`q*QQqr z%&TgtKA;n4V%iU}gi3B(O&R!q56xr`(q>R|)lj%&n?nGw+k=9CI`zKW?t-rP?9ZVw zv}MOs{VFY#Ci1}uV&@lJH=4OmBl-0(j`c~QhvK#N@u0X^sqo47?PL|QuRp38Z2?p; zY{R{Zarkf2Vr`M|kJVh2wbHxJ`~n#0Lv6wJN1$=Dw%0FOQwMd~_u>|m5PxW5O)@UF zLidj(;O4}#Fq}5M764*5APm?Sm8vkc+FFJ7tL0-Qx)o*}RdI6pTFPpiejZ+FErbx+ z@5CZd{xl>!4`t8@BrE9>R)bNjab0 z%-KB+`)-_WUc5R^0XJ^pFoW5DeDaMuof9W_@MlC7m?+ixRnHc5AG{fOY3>RmovgvW z!bP{fp#b@-ke-P_-B(Pm_JC&)5hsgpb7(msO>kbgbkvV*TN=tQoc?G8(ru+5(L-ts z7rafw_(jt&T_t#5#c|L0F&004WEJQ-XHZ;HzmnnB$L)>1M#nU#x;*Kni z+D#by>dXkI*nf&)EfW<+xnUz<#0dpwd<*267gT!~y^HLQhY(6+)iC-2!w{G8jhr!> z;S1o4qHJdvoobW=+Ll3lp2?0jv&+rNpnrvq{ zn9?FNZScZh2kwAhhpoyh4{et@AwmQsxD=M8GR^M8`Z{DSMyb{fN@D_*#wGa+=GLsN zzU^T`u9^VpTdza^zL0x-uKb7%In@6&Uu8F3Blh^=7lo$j{Ek8}rlqxACWqGk9UC$U zcqP;$SgRam&my>6u;R!g8DtUK@5uS-NzeF_)WG{h$=DG>luUNgC7uQQvzlN?Aj~hM zFWT9#c`I)))-v;(hDnfg54S&Dv;p?4ppE#hcG6?LzYF5yxrB~u7y1ilG$X{QUCkh@Ycugo6@Oo zANNxXF~tyk@QvU=egQSR=Z*tW$QMYJXo3TxrPkML=$k%Ok;{k%!04XETBVyhme#N~ zUk07|swU26A;($>{+uPCFsivPv)hpI4dzccLf$bQRHT*O-GLq>Q+C< z%RMrCj;AbVMzUHG1yzyRD94;zhOAEUvnimX%On`?ZLc zcV6F_CXmx9-1RLE@;TU27qEMG+d=PA$Q-cSx~&X0Gy(4TkF?Dzh)IY-AX3)oT1L&6 zJ(e^OZAO;pje!Mu`7$x4`YmwS}i;Pdm+Sf|lks2{hhe9=C^xyoYzK`%WZX5N3(fgLw%Dqwr zUpqn0*1d}FR;_N_D7Qo1A>x59_iPgsyvn8oc4Ze#k5g!h>*M6ztuEmUnzrZ}jWJV- zMbdiC&2ud6Uo>sQ>M>deCarK)RC?eX)uLQTTb&3h#FYFSN92@Pg^xUz!(ru-Qup4wujPsroS_9Y5@q~Qpk@sx~2W?#IhqM1YwU~q| zUFLYen`_V<)xNu5TT$zs2ajyCB!%4i6SbYvtm5yBTVn;@Lp*8R>_JTCYFftpJD#pz zv@+Z`d>VdP5pE(p24-^ch#3Z*wsjd>V-tP{@C|n-m^{C@gNQ|d6|;}=#((y`l4R^b zryq$-9CbL4x+-?el|#f`_20lyAHx|rXTJ(2?6HvH7m-Uy{s><0rW}~qX1$9 z2-&n=X+u83rjWAs|ldZ2ghzb(th6T-^an84ee|$E>92EQE%a zMG^lE^MX3L<^YPvM{n0<^KcAn=_A&$h%|*OEnxPwujv? 
z`o){>9(@^!!n325o>V-Cb);Lx@kEY?F~AX%;f^p&g3+wtVm4Z{{gpFP9y_G)W}+_( zH&dT%=5+cpw01t8&3avF$*Wg4HF4G~nmj3_VkbWz25o1K{<_?VG3E z%C%p#AK{=u`Z~)Qna$<`?nAV65Q?wXDC4Q=sOnq-uXR&rI(+l4BrI0YE=Ms`5SA8b zQs66DH=FCQG(@P-1JdsqBJ$p~i)+&mwrw*vvS`&s))V@*#d&=HdkiCiYC)ZB4k-Tosmovg>}@5oH%C#*yfI;6{k_i zv6q!|fAiJZ`N;lsj_neIu<@WIib|5n5~2U8?EMJ$MX?@%NMpsfZ;P5;WM?+N5S@%Q z3U~$A^SSasHG0wRf7QniA&pY|kE3kSw;RLu>gwf@(9?FKC@q??gLdYeZL`yMLm4V-w_ox6{#IE-_w$#XZ9Hmhlc~FsJH?fY zl^*_J^LgQNimvFq*jjyJ&j#Q*?PdRaPW6cMoUUWB%hClSUpy%Aj<+qFJSc2so_lz; z3+=oerylKBcF-|{`3+tsxm!U6oaABUiA!Bvk9-F?WW=#e0xI90Kf$FsF36IANu)@W<&UH@}vs<=KS8S z7Ftgt?o$6q+18L4GTQ(ZE4$2VZ~n|;>8ux=EJVU1jyTza37du58X%HzPIs^|?x%bQ zzliO{vS^9ceZ6C09ARF+9^$~NMm7qeat(~$%oh9Wp`N7=Wn=X6Pg&vb8fthc2PaSR z1+N-L<6j<+sOgweRh*39%hHBVCw8Y&6xJBlJJm+%!5>PiCO6q5 zU|a}S+5V`#Lcu^v2$lqtCYom7iQ|`^xxyhMzHDEYjVN5;p>zj-h;8k7c(T9F{h$p} z9o36v6+8u>h63&QLkj0^Uy6~HqFT<)8niP0B15_eFt(iz$V;n%y0(W{WWs+dXZx`! ztM5a+D-gNsn^y}7Ukv^D;yj0_f)s8dYa^&X0(D#k=h8(YtOy6hNa&o$Wn}55@=#71 zT_WpI;X(3)TcgHN&G^3txaBiO{tRNyHfNu9*vn~Q1LW}t8VtRuGJdhuJ!E}~Q4v;% zHlQ6!H&e9OA%4f9<*Z*nUQ9wQW|vD7J1*dXAJQ`-3d=3e?7`Mtzk^-cN&F0^kkQFv zI0J;4C}6K}?bH+93Pqum7I*#Ba^$U8W`1Y?P}LJB_UciDERbC()~a_Sny8(~FiX-~ z{-MKmwq&%dI(tKTqC5YhPmudzBc5>f`3Nm_R1enaAyP&LS8z$TV(}qD{>ZcDs;7=q z7e%I1v)jbTjO0c(;Hvgy%fz-sIYgo+J}ux1{{CMPwo<8k_O6HjP@a2=6v>;E2b$o$ zvImVJ7CAL)y@1?Eq!4`p$_r~;^HYE_o`y!q9(2bD$4DZ5!}!wQi(QTI-^Df{)iGB6_l#bXeNNk-uP+2qFuKF@DxCaXf2#GayYX$Sulw!l^QOjNX3b>gu@$Pc;aDf>{i3HJVHrka z%wVQ=VG1?U93sjNPo9ct6r8p_8x&p1OLO z-^pqsUMX#WwyC$ZSCukc--ocKA%2)UH#xi%3z{@0?^ENFfZ{GCcHfl&&K~(D(XCWU z@97J}bI?YDI@&7t7xgH)`cFhMl4k=RPu&o`4zaGSP23s?{iA}~s~d9d&%09jFE#pI zC->ukf^l9H>8_LX})3TAcIA^(O>JCaf? 
zA)0mn!cQM6p~!B!G5ygC}YfPT_WDJF@p-*bp2BTCS#px$HOH3pN!xg2}pS>{qX6LPa2+tJj^d$PpRF^sPZ#R8{mPPWi z&WJHrj9WiIrz}PIbxhgI;0uIA3xp!Wn_}elhv8<&$glBw^bre*)JuRyd8M$i8-; zEDP$8BMFB`%h=V#0kUFsJ3k56+(XE}K7A4R_Hy$1=JVdt{qeT;MSnmv z!s#h#&m4(b{^&`f-Qpz@uBqtZI9c=RUchBAmxq4t);kZmyd94FWa3C+Fp~$~oy_VsDAuiASTR{1 zk)bgY3iz78dmjj9AvST?=YBR$$8XLX>_i%Je4UHX<4LW-NRc|6Q_Ki!D+Dvn_0Wb! zRMaGB80y+GP0XBL(bagUW2hJMT$JV15YjgetBfH9cX&&jKAT^gh*^qWbr!;PH&U=e z2FxNu|Me3gZl{NE`OzA6#UnpXbBExWWt9P@1?-RtIUE*GJkKC_PrL>1NdCu%#|&j4 z5JdVo;gB~W&eLM~&?r=(_)tdKAWK~1g+2__jwaB<%9D2LnVC4dBH@jV&@(~;A!NYp z6()SGp(uj#AZ6l??*d{dVUxx5M6sc=WQQd#`4Qs6^tEX6+8v+lZ_T7 zw+vCPJ%eP$$b<*CX`?r!kCAI=(3@x=z~X6@l9G!n_;=0$S79_F)&yS)eda|OALbZ- zh!j>}K8vdQ^43Pm<|3Al>{6YiAK{ABOKhH{fVX2(cS}eQcu3lmz7!(-hLvgUZ&B0`qcaFMy@a7)4d2FioojxDshVnCcmlcddDc<>~lG z;u0y1>Dh~l!Duh&IyI7aWN$vU{QKLrur`&>8tg=DB#cn=Tu$>uVVmTZA6kak6o14> zMy)vI6SFY-+Se*P3cbT~&!rMk(BSQ@J6Ebh$kYej{CZ*}|L5n`frG%`_cL@X^H5AX zooWx-jf7txAIZb+K2A>g0~zx${m^N16{d_hLil?Kj=qS-6n@!sV@5;0>gLn1u5!kQ zRZwzu5vTtG2dCQ|P9_JYnG+{Mk)OqJ1|9K{lV<9<4sT7= zCQS>(IGh9qI^|N{5KDiKOJfn|kkVei^kS0LkJ&2r4Y=HYf3|&I|CX`tm32l^WiyN*$(k zQ}hnO_$9ckCK~!D&KAQ8Mjy-WOHDFR)rvnF+8L@h=($yXEi6y!RZef))|NDvpkSN- zyRK~Vn*5Ko?PR>>U}NgwD-4}zHPs0+LATGSoE*s31z;|ulKCvq6DyNY2tP&08%#xQ zxBTU!NSa3pf+559%k`cemA8+OeOe|Ig5$Jf0uI~1IzlTd;$Sf;jJT0S+Ao$PI2RIu zhZ6BS6NEg4q7Z(D)i^S0xg@R%HoRN*1%u;q0eL?o_sCpkom;Ifiqm#}j{T4FxJT}z z7)}WIp^|1r%9z>g>|?4dUnhgBm(WR)*HBkyIQzY?(BJ{%lT7Af+`>5lshmXKD5?Fy zB}rMzmY}mC_isnJi(6bIdBSMy#pnlW2-HnN!oRs`z&t#JvF(H#5P92w?M&osulH<) z#xlu|?)xBc2u_Wt`s?Eq1QzC4rQos%I@R?12ROQm4zK(Y$byn(*?j7YtkDuJ(c;G` zILtk?N%%q z#@Ma%HmadGCMhl1ID?yVrh@6VL@`RzyrnfkWTp4hO{kAo;m-W5+;qgyyz;qU)Ok|N zSD2n(K6GV4mwlH6g6GOjquDwq&&p;N%gKLFt?r_vl9Huw391-&|0;K1{AYUX45OAk zC%0tdko-T!?JFDS^gk&ncGqJYD{}Ib9E(K$JX%KekK+(ub!gp&>TUT zeb5KOMQH{9JW=+oeig4p4RxXrn&*ki)og`GC32xEaJhqa6e11&I+d=(WvO^MvRZj| z8!{@J47zJlws7gxTuJSqK%{AQ!9?`6hY_OP@(5n=5hTi;87!#?)dzd0B$k(?dz zm+7`~kCEVTD56yl#w&@2VC*kbMGcmVwnuQynKyL}bSFhU)5V*?BBbS=(MmX-xMFPQCX^L_NX3S#cLHY81 
zQT9#h%U2@Wp0u)_IKTeOFnXCO!YBs+)lal!WY%Px{#D3W88Q{lUr-vo#gcHzf5GFG4j^^mlRYY{K+e7y*ZjOiwey*Y3P*8KlO(`NYG3DGGImSEEhwQ zPKJ2Donj%*IE+^c)`og$SKd~A=gE%K7MUzmA+2kZ2;R$&cN}+u@q^YKt!GrP0oKo1 zvvF)Aw9M1qDc;o-uhv|qm@}DnX7P|(2@8JbF`e>SVkjW-#H;>ip?PWt7aHD$qW$3FDBa2tXvlgpW@?h9X zo+9`s;LK*vS((+Gi3|5OsjC)jdr*7u?RWQ$v+%C+2aUPx2O}%g4#osLyIARa2sR^O z^8oarO*VlUkOgLgykidQc*^CY)k&#iGyy;3ZZo(SI>>wI)iz_W0w|lr8JZZ1UPo=v zBr`8CjJww41WhlPcoTXxsfWe}$jpRGIANr5X;p6zt<9UaQ}YfNLiV`8g<7d^K%?Hm z$lCg_3i3kR*uzuDS12f+mo9Ksxd%f{vWI&kPP-d*NNgRLVqlt=nZJgkLOdFRb9#08L`trB$6lCA?i!2cyr^d3 zc_=W!*nU#iTt=fyV%2?SbdA37hN-DG2pQu~jiq16K_(_0pA=FUm8r#}1l>1NJ(=0j z6IqFwWk@rwx8QoGLF@bpzcSnP3vmW1l5;mhMl?||{Y6~E$ZlwE%-mZa35pm_xfVB^TZ04YGdatp~U^OcuN)iDJDn*q<%sBGfBdNn)^<(mu6Iaa7z)|tKBuO%T) z4^ELjwQr&g&Uc1e-i4HP@-GoRrHb7~f3SB+Xwqmi>%HOzyzsqxZ7}N4RP4f$o=?Y% z3BoKnd(*|!GgQ#Xv9-IocinpfIyT2|44MB&sPz6Bl0bF^UqeH5CWR!kM5b1DV1uFg76yoqQe2rl5ER zMs^8;5=&>UuQ%8rjhmKSK4N0V-ZNO{iWa`B{`KKz;5}>Nu3iAY@X}(_Z3ETrQiyGldt<>|I8n$)`AKFExZb7$Tft1qQPyG zS*0?B3Po1yUb*g`v*G*QT`Ip3?x9^LKk<|YrN^5~T9A4I7KlK# zfl^hTK%lgTowV8-)?&lk8`g1=#5p;eb)|88;8%f9-$j9dm_l}^7A^d{;E2P8dpLFP zmcPP^4{A7qNTgLKK?xE5tOnB!7mBhY78JTw4+>=>6!CV`l_e3B|7bU7l~T-G<6dMI zg=4Yq*&&(1vNmHzCL{sN^mq^ea%YiENBBb#1PA#_2nL`#KHC@+f0Jed$fLW1WkK7Q zIm*(q6F*MHme$Q9hR@VH}$K+F-WQ4R6iH?1ny{(|QhB2CkVF0byY>6Wo zt5+C7I$(WVQMig=d4&|LdsIy8tQYzKk0Zo5MHt4q$2~@P-yOv`L+5-fx?1aY z<04IEDOy(t-k4X@65P?S9cUX>Z>sk-~K>1a~_k!mVWt zOcIJE*Mt&b)i6EQ6(|_1f}z^i&(d-+oB$cQ{gQU9 z1~w>0QxrTJw4=xK(|#*=Z;P;KScdlHtWw}1u8g-|e-T4j-s@bafK8;X+2>#s9)9*TT+u}P4p zI$t)%I+&%9&4c0mC`!5{^J1{n743f5F6AaBp6Q-*q|9NlR2FgmXCaW83oJzGjdL`q z8m3_S}jG3MJD5tl{Lm~LJlo}wwx^M^#o>?B_esoIOf3R^wHW~V$OeBE*^ zJ|XAzbY_Ojq-CpIi&ic1X$#I683b4S_-(Z+>rR%r$+RO1{{Qy2ONuD&0b=6 zlPBP5eXQv5=q=I-pc<+-t={1TO%;}S5y2KBpyz!ylnMmsDGFp0?S%gG!X6!ksA;LA z_+%lPV^2ks6%5v0-rVpazS&$ExhyA531M#!SFHE8I#Su_;FCfd_zK_8I(0*B;TcW0 z_z`He@Ts^RKj^RN2meAZXcR2iRYZMLZnQ&>R!d+39lTH&5^S9MXI0Grp|e(4V6@bC zW7U*|^zpAxKykd$dw?5mW_%I|USzxs&!-wqoMVbvIfFu_cD`6mC6?ok_3~*BU$YVl 
zrn#3mwJPJ2P;bx|EU>L?**r%k4wwS<0-pLA{K!eM{rxvZjOdiJ`)X0&iRxaA=5z7p zI&|6so$qZ+WdhzM1&1)k97WrlFqjjhFn5EZP;3aSP9ZlQD?8M&t8oEV+8J&K^Po;b zfuZy8jZ)D!EZnk8`jUq$D<5!$aGNfK^4)ag4Vwo?W}NFtq)G0LvBXT6^;fe`YL!InAY3}K<< ze&z%X88xdoSs_svA**IB&LC}r0m8=S7!?`)tHM)bIKiwB*oz;r(BP8t;V(<>=)MW! z4JfLfnydYb{f8uv=PH^39NNG`u{feKCIqVioZ7qXkNFf%f#D&2G1YB|rX@*^2k$7G znIgFoTe5Ndu0cMo0i`9)ZZ%Ep896T^8w|b^BUr(wEnI1_iqvbXFriz4?-lLdNHzh? zB11C1q!uhV|F)3{f0(EJUNF6<>TyKg_M8CjC0lzeiY+K=C0=(coE`;f@??NxdfGvE#mfx%HGu96py4U!U_0{NG+) zkG_0PpLfm7VNDF0yhtXrg7qkDrIp?sBA!3JP+-_ZahVE?I3Z!#P()$tnlK!%VU+4C zB2Y0)C4&GI2t*#Sp4C>XrD3qpfZwm=pgX@k4QXqfP4P!tpnc3>QH-vfBag>KB#VR=WE1>@5^VH zj7dS&F@4!9nrpFc{Cym{&R5n|4;%_2J|wx!_zvh6EyjoGMGS6 zn*bWBOAsFS^Ez@Nu;fT_Y8U(MdeI~t9kh_yGLV-~h zkAcn&BB4ceW2b5G#7$`jv7(@CbVeFTGad9LeC*_f3+U6k%%=}zyuXz7g4_*I?^>GM z9oq9+RqPF5dc>#s^g1aDxF+I5a%@9WfNh(mDe6>^Gj+$*QbkwhBL`xg;d3^{e;6Tl z>aYFq4+W=BX&x-oL+aWciea2#$z119e80TkzKcJz!(43MI8Js~CTU;ag+_-mg9E_a z29&9VBV>2cJQIu#mVA<}jez4{s6_ZKX^@Oq6q#Gx-5(Dx1&wy1tMm51&N`iTdwkUA zPz$WwQTFltCL)HL`Uze_1;!j=LHgD(4w1iku>D=N1xWj-JvN oiL3pegOfW-oY& zPu=jO=-Nc+Svn=*(D1Z>n%PzjDXrw${?`20D8W|DRejg)6(hdBUZ0UW_^->*T90A> z@8h;7*KV$O(5Ln0$H}jN_um;aObOb@#6gh0vvv^q<&prrIus?N+_s2eG*B()@8v$9 zfHhTPe;>h0^s|YrQ82?Tbfv6GA0}HZqOL0q2(xzGNi4Fd*I5Xmw?NTJo%Ee~s=vs0 zX5aC1)8--hmMOufo_1%MTl_Bngb}Qx-&RI?3K&eh?l%(**Xr$}ppFGKj{apLu(Bf# zs39Ew^RJ=jr}O09-J(&W7@6JW9}23Qf84x~M~qr7?pJXcK^Ld)AFZCXB)47c-Y=F( zAuEX{Y2ma^_fX=w8;J+kVYJl8JswK^A5gRPE(4ksujZ4Q{kv&HAt8{%`5+2ePSIGb z@Yl4xMM?;&H4xG?cZ|1Pd%H`~{Z=W2!Z%XCTZ8N1HSo@dV!*G(SvAZ4-Xsi)Zmj30o6`hmpfc*}C+oq^|#iYK{8B zZ+)R!BUHf;yt{C99_a;LhwMH=W?fNOqQ!xi$0}RUMLh&G(OUP`rp>p*eJM!09fRuF zSj_b8i~0*rSoOxhAojQ|iaLOC#Bd`h4`>fKDNJEhxK+Lp2uypw=ODXbZqTo2PVP){ zfIE3bRUf1+F<^#q+DQ|Vr0&`NNowjV5qcuR^!<}?`~$2N(wu)!wb}PW<5%~fu4Z~G zqc3w-qS#8F(}n!-y@nOvDuWlMe`vmf3!w}h*=DVVrBcC>W4%~VZoY@yf z>deSXxUBrKO2JhGWlQCNde5H|8$UBa;YneJxz*;V$XCV@)>p|tp~6}U7f^HuRJ3-( zqvEA;BI(L6>yPh(GDTLQ_w`2up}YV$Q{x|nx7v+TwLzBH&)s+0uO7~y@DXxr&?<-# 
zL2c3(efCmbra$kP|GAL`H)Z6}DD0C~)C_06G?iDrtbRXS*L8VZ0PJ=VP`UvKS^X@Z zK!luYas-*D>vlB22Ks5~0Odnk?lxlp4t0Q-o>FWN$AYyt zz8Vy$VA~Wj@G(LUyfTmUl7#8S!A~D4oVb7?Q>qi@WaVB8jeG%SF1$qx2rExc-ZjqD zfJJB=#z-GqX|#%l1XpP}$2lK@x5`;O?~uiyxJ6LRjl%Ka39j*A0*z1a6ryR5WRB}j zh&yVcJhCS^oD+n1t#4OIKLEEFQ?T09*Q8V9nBpX!r8bT_y{3UQ_Yy|3l=ZY za(ylHps`>YynQC{tR$;X(;pi)uL)iJAPW{fox%KP?4$&SUhDIhpE>%);VDL27nbz%`PTSCHz&XCsHKnHI=E2Z)8382tGcyUb+Tos!wl0 zPd6(G(98_>rm39(xreAXOqGOxX3k<%iK_aeY`&@fS1G7%7KW&l1x%U-pRy#&um?A` zG?zY+9Owp3c}sJZ3Zo|G3MK41tCKltI%Nzdt`_y3gol1bkB<^6Ci{29N|2#q)7%NK zO{jRF>O_I0aH5gg^!jVpr&~u+0!^KDE=Ra83>GP0d}h+I)2g^XXJY^|=S}lY z;$>NRBUnwAN(xk2WUs&+!#!O^m!0^pr`$0hqB?f6&OHw5%4}+sICrN*X3Gq!3O|Y z5ijyc)XTz#up-Qd0_P)$E3oh6(zv&SR&pqVI41Qu>>g$8mvlLL;~uX|T+}V$(N6SM zfyX5zSN(*VxFJ;et75d+H#%LgIE{wVMExBcSPK(}gF+GL6%^^}NIz-{7M(J3^hU*@ zT?~t$tnYY}m|rSB5z_QBua2ZofsA>h#*hKTNjLHx*dk^{yC63rs0(n=RXM~UYP9o% zMpV0cdWZ+Rz)QWyfXu(l5e*j*WQ&C>IHd7-4XA#>dOcs(V|PWv$!dkr+uGV)%*iSc z%+!MmWM)yHN?|d;{X$$;A>#Iw4=x{aD9baZBIc25hv3d@Z!U(z-2Fh83jW6{E64ty zS$_thnefMUgVE`ZD!Zt@B9fyeHQVNH2_=E^QMt`wpP&RwI||#v*0K@JpKGnYGQ;qeo-el%Izx^xMk61MX;W(xwM>)M6X* zNqkh027+PT1XaO~%YR1e;MG%MN zbDUvmt6zEO_u+yvZIQ_4AT3QI(RM{GWNarNuMJ+UjeP1uGFr4}1EodkLYL2_Mg57f zd*_{Sfr5@JPRq4Lq)l}>c)S#Rq9*Juej1<(E>=iU)Uxf`HfHgu`_biS;WDanb4gsg%K#%6 z#TpmoG-dpx9P3;qtHE{B+J7+ZC75d9qoG=KGP2`kH7ARdyGlnIxi@FcD~B2r#*+7S z@)X8Usum&g{{talMk#;J3mbox3mZ>g&wh4_*A*xGkl`@wS#B0rRQx z5K{h$5*DjBJGHJn;YYk!AZfx9ERFA%*xDg=oJil^^#gu-)tB46^g>{BN`gmjf>p5B z9#;oQmZ9sVGuP4kjkJ{5F1;hQp_wuOV35&w=8qGIX7Y*j~PuAJtt!Y#=)_3p0^6a~E3rpgnpc z8YLjbK37~diBvX4CRbTB*~#y`Vwz4Co^no~2%ye@e1s|*6Md7lYdRlc|qvkT{pYnvUNY|atJ1(1;YVJ=-wJ# z=7X<*CXP;eQzK$a6gmqBTdZL;g6wI!$`HzV*TG?R&>m;hy&upJ&IXU3DET8EIZ`xSOG;u10hVqxsAp3T-umlo~x1_ZDiFL&7P9L*iZPJ;K=3wzLRG1}tn0 z1Ff7bFj4U6Kpk;^lVUcf@-ol}-a%j2^I{W&%3NqhaPxkb)h8<22?P*&=UNvz;|*v?<_f_o%IZtgwN}O zvp?(oPSyk>wybJNO-`}1@~-5lrhA0A8aV})|$c4$9Iqg_5v5x3S 
z5%pEJha4s|%=$r2>jMBSlm^jTE8OLE_lpTnfC{GPCM1iCp`HFmU#x^fIJ?(-{{E!E&U}E!4QGFi9!adzo+A7GGb1xdHgV{lNnc&aaZ^0O5``ixlkt`Rn+`HKUF!zZ>3VnV;B z6H^Il-OvD^$)#=8UzcEv#Ub#&xcg)gL)*K~APiro^m}G4A|zcPdL~ujlt>HkTz>~E zfZ!G~vTnPA{XSIc++|>5pr%z^K8I$o?79FqtWeBcUvmdGiDgH!wsrb2}Nfn?@ zq!Y5bSuBh(opTjkTyiJA?;sT)j9;@U_>EreO@>9*ciFRl8tlz9+u_>HdXwSeZ(q|N z>rHQ9&5ruto8;YMo{Ro8vCcS%!J{>*5`5kp@vqn;Ldo@*WAV#>VF1#NCy>QwBHVFY zyFovJ2)tQ0N89;=Aem^rVKTL(D;!2{${3aq0rYs07HdYR_)KFZ&a?sQKo6ErREDHG zgYn@qNzoKYqN*a1AF@1J83n&+6|7RJ7~xf-V3jM51H2t)0Vs=9UX*prB^SM9w$3Uh40Bj?oagU=_MD z7W0whYDBJ8uUhCg>pJ>R8bC=tXjpn6jBvHjuIEA$1<5zS+-l)VlZWsOxD1ZsS3yKfl?c%P&$e??yxmu;>baie@PkEs)BpOY1l_M>4W`= zNNftzA*#thRx4~in}yjC)d?1=EgpyvJ|DI7RR|uC?>Ro9ico*lOUy{KTxDE*Af~-T zD*mLFBPuFLE4?#p)g_{VIC$F-Q0 z{`#Asf8`6TeSh<1ru{d#Tfpz-w*pgz^8Bvorb;&DXg$t2>>iC;)>CHXSZb;jjw#pX zN}s!eWY9Lo8g`^cgSsy$Q;`Z88AuTq*2^k=PGSwJU0@s&Vw9YHM%GP8H3;OYl>miED<+ zO^6gxeHnF%Yv`cs2l4N5Us`oWzwVKkfwJX$@H*}t9G-lEkH@kE0v@I~{8h|)urT&h z{ri`Qx~2ZVwrst}5Y2FR-_(xcq#)*T#$P>=phW<%ZPWz_R+&c98mBjzTF#LnW=D(Q zv?}$BUqvuJJPYex<8gb9?d};enK0J~9?o5(_;AnW65D$srVcaNyzx92J+h3EF* z{hV%$+q=yv)H3vFA=l|`&VuIzyM_^{r>^?3*mH4~s=vcKqLC{3d@`bW<6V_kz_)g} znnwR3% zryd#VHNWh}w_F_IkztF(Tt!LX=Xs>U#V0d&le+K_K}DS2Z7Ir1qHxKi_lFSWabgcR!ETF>&uEjw>hyd;#IfS@>De_4nqtln(pO8BKTODqI6Pif!p z1gP0!KN*r|M>rtDWg1+XqcTv2WZ%t_hix2{V=c>Si$b(zheu~y8e8+e&p57)JwzlG z#ac_673Fa7C(Ok+Dmd4afOL`Y4{mKV9249-E&v7&enL!gSh^!={4Zi<3K zWf;J07~H&5wTXF}4+0emj++ca&|B--w)%326JZ^)qZ(>oNBnb;ehD{{O=JY#t5BYT zaBjoBiO%RNdB`dgQoIm#+toD6z;if4()wA)-GCABLlgh72%6Q7VImGj1Qz9uikv2p z8CwOI*EASP_(C^HF~5|{%s5#d-@dbolkP?`Gorc2>KNaqo9B|FC9J|&@Pbfc@l+0Q zEf&=l4NYac@HBNlZx>~qc_&>U)4a%YA5Hy7V~W1XqM~&TjCWacNIS;e7cm2Q-ai&_ z$kP>Vw%0UfL->&<2F~`e4$<8C@4Am>sBgvZ9C=V)WS9p!9R2%UH#y; z%r!*}{=&%@U(HOhL`Xe^AmcAJSopc|qU1fi-Tit*d?7X}EKdi1VCO7*Y|w+FAxUjC zYsKsJ_)*!VA1Ja5ZNLSj-*yp&4#1FE8t|APtVBr9cp0vxJGMz05A2{Ih&8&bg%0Fq zIC4rIaB$L?H>qgM2{CjyMz-V){dWE0iyXNTNZplffWStQT0?a2fIvs0s`muQyDzbq z_G~{aQT)oKA;9wdo8Y=ik1gml)T|+n!4D$m`x9XseZorT2+omp18QZDppL+Oe5p&$ 
zYxreHujJ$snJ>qV%?yWNQ!>$=V0as=?pz%Y=5-gv*3K*gS6`BV*GaiLNiAk0bP5 z67uOfZDs+J;Jy+bw1;?_;l6<%G@f^XuUEq^99XdJ*}L{QrRpLSZXQg_FQhv%kMSTk$AL-k zj>>*pKAPN2j`{($!O|}k{5*!C01iE!1RUYt3iK|{4NScl#NCR~2PYVA4jhJjW0n5Z zKWEh|7uXr$fxL==qax^OaBiL)h=UD2_&3M`0zO_$h+h>bI|6pXgZd7!TIQ==K@ENgyOtOl%$9=?y47GSnT2mLh|L^Q(LgdMALK}!z!Xr8 zu+op!9z!24IN~`s%0NjthpO2&7XcH0ia^p@#8uSjnDq40VOabX?+AiTP$24n@DWDH zz3^nlKhSS06x~)VdiMjCgG)q9LXCTs5w*q!_6L7_{S_DmT+`BtUtzlV&@RBlhV32vPr;>Z!-XIOX<&95#H**;lOk9( zZ*-4t6}^9j$e=zVvf7V|{G2zMt&iH>2D|(=(`!_3l7NZItO9SrnmaqyjEgiV%ZAM9 zi_m9Cvp=Ex7k3I4qG7gefO;aH_O4r9$}jm@1gONN-cK8L#`#nQw0 zR@dF|Buh;NBhP+nSE{SKm1ULHK?_jr!iKl6|%vkjzC`J+hkh_13`q z!`|R(%P}3A=hE5)6VR~Q(wxUndmiiD7?8q7*AH~?!Zc>|`It#4 z;;BN5H=*_KT6EaCdY7I`xICBx+*EQudwY(RqfjBuBMaqTs0ig4tV{96oAKmCjg7Lu zU#!|h52&1X4rxhM@M?UCGe^*hEWI1trWhF=JQ48eJQ7Gdc!>nxYq6R(kEhj3+t!b< znwbyQlCj$`CL&jAZ4A7KPPjuCK`l}jbtJGzz!c%NxCNAl$9*{*p9?q|8fj(^7!=F`HoscdU{es$niI zP8#!FuQ5<*qDPS(bE|DUExzbl8qHs0!;3KUhLK^iua~CrN(%*;fcN&j1cC(Odz*z( zA`^#r$~Uj<6S+WCi8;`@NN*EP!WzfHk#zz6kGG>waNG%$uksuArY^3|W=3}Zd~-0i zhWiEp0{-~~003-UEL_0<(OLgX=jHw95R;gvtAvWHk*gWtKZJj1M!>Hf0F#IifEmDK zV)V7;=l_TKKU*eo2Yc85hufS1$N&Vee|4+;57XC+=^qj^faCv9V*cXd{2%G7tFQNe zMP>OvkpN7pUXEq}CPgC)GXMpE`CkzgjhxNCMr8h9TP8U(Q!6832Ty>`7wM~0E@ox` zGmwJ?p!YAo$d`Oy4O{?h|JOpo*}={6|B?AWqx?(#pUf&>V>=t!yEy(cnu*u{$Wj1T zXE(F|?TLJGiJG}vnV2a{2>(O;uS%57TpZk-P0U;X|EO2_|2a3Vul;{6!#@g@HM6&H zwfqwG|Kt+4vUN3c1~7@+eq8}kGZP2Xe~0=nCo_Nr2xMjdr?>xJThH7hUk_h3vDc@` zTyLK%K91HB?0HGRN@Aj72@JVqqP`d?2}V$yG`5mZv>_de*OgKjF7>0FoNN!~l&ow7;LP*h>*#YmjPe4xu_ z^$hZFYVq=2Qeo5k(D3#T2AWpiXhkOi67%1OS}Nv9Jk0{Zif1}76-0T?EB#|aS0&P? 
zdt#oYp6A6*==tlp%y@<8JOwq5v~1KWp-PS)oVjK4d0ZYrPdpdioDN}k7am)zMg;r$ zUPOi2SfRjdF8eR|NNXx;-T{fL?Y64%IiuTTG zBq!~~=*g9k^S+0O&J<4jLb*{xQsYu%<`@3B#tu@k+#id!H(Z2Xc(m&2W+qgrY8a2)u@?acY z8I!1cT_sz-!MG$sM#@TGd$b&zbB<7|zFp@g+jI8#1{PHzJ=zma)l66`YlD7p-o1?5 zr_=b)M~7tn4~;908*>aniEBzz1qEVXGsU;G$G+DqC_ULGT$u(XFE1Xx^*9QsA2r3> zhV*0BGu9)Dc(xSnTz*)?Kf=V)F+vGRbj#rZohz7v>c6-G!D-4PArco%;0@# zkAE;rEZYhA{98`6Ql@=(7Y*lQ#IS6nzst~8S6){j!I?>VEnLSjJ=3Vg0L8iq@pE?H zP@0^eP^_&$!YB#W<^-Xq(O$*SdVBJweZWb^A?CGD3^@ZTwX;2d-=oXuJ45j!-aJeH zgO$42+(rxskwAyn6weRV*WbRA`^U|A9*-El8JR{iT_(@HO?wWuRTXQ=tC#XdnMc+= zq&PpbK5i@HW^%5@#!0J4E+~Em(Ax)zDLb?0pr_y%6VSJE0DFW>b=L|+*LEwV(lP=q z+qp`8MI7B^;PrI9^lZ2p>*O^xIHbi$s^gh9>G}6Ik4!ij=A6u9mdhJ656;`PV!3WW zmgpv}-l1I9AL^f@_C1d(>~ zx34SG$)uib6sqa(_E3*HBS>&g)^{=4&f-5%c5BVto-0PveT#h`MIY8r=%)-#8ZSaL ziDN%S5JZR0veXDq5=X|>`#5}^;7v^b?k@0MFTs3eg{%q@Nv1c3h6m%whPhgRSXKy% zZ+v7A1_;4qIDc#8RHzj27yr-P6!K2G2|x8_HjnKd!@#s;K>y>8Nj~j*z~}xJ0aC*f zt2jb;)o0^OUL{jCqaVFZq)tLe<_Ls_c5M`$m{u zM>`7JIm#wV;l}&S9u16I1aTgQ9m1wojKA2YJ#l%Q`hE$HBZ1Lxe>IqNWt#(^W%l`(4LTsW6?GCV0Jl-VK-fv3r6aP=7o zMN4Iduv8YzDQ4=`kZHH&6%i?jFwb`huv&k`G|4z zdR+7yag{K@@A$Od#D|Pf+UxOJQ@J?m{j$uum!Kix>pYtVbNDsrKx2^M?%NNw+9w%z zIT}P1cjAD`ah6}#)B6X@R}3n%$}j}4EGT=v8i}krLbj=Y5x0i-Qy#8Z z1r8?#0|ZR-F0NMz!m$G6SsGwKKZEOeA10Uc{SC$Er_ASjjVmGkXYb5VM_$e_uYb4K zj@WVVJ@y&g-XNAkvQtObtS~ScCQE(vSxXQmC|6iuLkCBduA*Zkioo<{9foQzLG}-UPq&6AMT8 zgzXZ8=AN)Ae&x0};7{hm-rPZ~@n0?R4w1q5h^fSn4U{ro!ZA8dUc7_aGPVdm)$npb zR1l6$MvIzHafn4SFVrIjy!e1-QdDq@8BJsT-v?_hZ=Kfj;r8A#2!S8v;6LLt z0wLgUJ$^Mf7_25&-TZ&dy=7Eg%epO!26uM|?(XjH?(XjH?(RVX1a}Ya7Tn$4A-IGG zSvza5)6PEc-23k~#%Qx_RMn`js+eD&t@q0C_Wmp>%6o^-s_~>Jm{+=3&|{P((B=!t z-T)~7i#Ab&?u?I;TCXq9&8Mb)K*w?;+SR^w%}a@C89jNBHqC8dM@hXwAcgz~W?fnq z`EuAGf|xo+n&K6(=uqt>@uw_>&Fd28uuVF9z>6~PI09B+yB+1VI3V>YbnDve@7>_w zlZ612OMFEj*!rVyss*KwbH`#Ho%8OkmxN6R#%G@FKhqzd3w(*jx4#*>Fh{Sv$h|V! 
zcF20%i=VY#S)>J7is&)37(_k)_-S?e5b=n;yZVWeo@Bjl&TBvlnPlpej(t?#hv#QJ zepuKH1wxg)L)V*2w~ea0e!qc5@=Um)ll*%7SCN)N%Vq-LJ!9f5s(=%F2Yx-4PiX=O zIY~P}g5YSJj<{Y8470VVAf8f7m$#QvUVdXTNzf!F@oNn+Z{rfLGC4F3E!cr1yA)-LKpDB_g{7fJIT1 zRQ&>Zg51ZLDD5hup-e|Ml#UG{zE6hr0|t(}!3jG<5F zD>{?jV$XbxI(6>os}-fsC0*|*6QUGb@nAa_Z{m2&j`}CV7&@*%w;1IJWHH#d{4@pM zB?2!$SGG^OHE1nLEF=q+@(pXmvEtI14u z-Ooj{kW$zI4wvU_#UWl^i%n%Fi+b@Up+-ZVmPkOy!K7a>?zV3fr?p87+^z0zsYhom z5Bvo-b)E3<)?>uGNLy0vb%ofJ3TXwGs$V|gGiwPXl1wqvo;P>ZJCgc0;xh4Os}@wu z$4s9a+&4GJnUQeB;)dhK>M0<`O7{Z_0TyCq@hJBlT|W6B=8*{`V@W`ZM>#C7K@#^Y zRzea6V=?UF>lZkMXMrb!kU`eeZe1K?fW+Q3?|@kqE##hE6zT3pQE4R7I(pBB7123hLA>C%cz{3-a3;3R0}jE)*V|SU$o>0Vr?xOYohVk zNR<6Jb^}Y^5l&1q&Jz0?iTAhmplg@y?WPbsui$H!B_}BvUP5$d59N^AI3DbqkY2Lm zpwZmvPjND3^FdtX`q=jUJz_;HpX7=Hbk%bdg8q-5uc4{@sl+|i)Wg} z1wNBR4(_6zQe_fnt-Yb0=}Q+Z1YY#@N^PYO(~2)t+aas+yhScNg|JZZxLbxeWmg>H zoYq}?tZTxcHMYwkk!$i^H}!_Ryr1Fz$yA(EhH2yb?cNR;2^C>_Z3Xz3iBP`O;Hhi3 z5`0Jv9aAK0jZB))B3+}E_EAdG@+E-;-)4}U$RI4CK)G0AL%;x>zY9i*V}P=-_KjAc zvvL~Zl5KaPsS~8N5h7amXJk$)`F?wq$U|zdGFGB)YElHFBG=BE53u?H=-=XR$!5-m z&Y_$|&|J>Go^Ru_vl%s8Yf9?P#dfwB#Xpn7b{aL7(FO$_LETh%MF|))R3V8R z|AR54d?np23Oh5Mb@Rv-(X`l0$;qp)tp^qWcVs~QN;CpRvsrq8bX1c{bNa!|f=mJZ z5UW_GA|^@-2PORr2XUKRyIh^z3jCK$4pt`CwwYGW-abFgF5_BtlHK}dE1Op0nJ*}F zC57Qza{6IG3QQ6#Yev4kXY>%|q?VQ~6q<5ow?ceN{#Hd3HVvXy?fLoyh|c>K-U!~d z$ioi1yDYAc41A?TcqxZpQ~Fenb5oM!QVP~7iPx;s3Bb0c5F3NOZ28Tj@kLa_cS(JE z=+wfT>B=@lw4`g-)TDO;4vE3rQo0=EKZ_r>?MxhP&xn=lw z=*zSMei+Y)Mx}+r2hQY2En?f~DFRacBwpjMY%kaaJHe)SDhOW(p0yFP1{9UGoDQv- z%AJ9NW!r&A>O#N7fyRw%T(_;x8LYiY%1E9!4rKwj)HOm+){dJzf{l>nL+WP076J{M^G3>DlX|7T?iN!wIjqwP4|< zz2sA(L_E*at>>7aGFb;wBwt)6B?~YxAqxr^SdI4GYfDca-@TW8dN5-y3lPmsT|KZ8 zJ!R5)gjK~pSfH4?XuN*uE!1nMpyXAAeJV$Pc?WFTh=2fsA45C4?mcmc`2yVzxt{zh zf#wvD*gx#r1a|e(lfeCj^O4<%FJ{TY7($d_8w%O!et|dURkDeu1Ajp;P5;5i{-X=T zoS4kpKKz&J=?Iml2X~W@uK;aIxbbWkAgeDdJkWvSfjS)s*8NTst$5;9x1_?){nyp2 zljlqGqwGGnZ5_fj>It|}_-c>Zc+BS=$+D$(NIPmf-LE+o#C0NCl`qWZhU@U-5w>I# zrAmpxN83buOv-`Pa#nIQO4Mlc#Ys0h3S*<9P;4ejvzWoRdXo0!&)_~mdni;UN+2W^ 
z`|z|B%2BzqKs->`ozg~&EX4BlCGhKoE@s7JAca0YUzNf}R9@s(L3n@>+*0ycuLFRhMB*DQqp85dFbk=9BglzWk5lWH z*d-)#kI28*I)ryrlm#a^-<>Y;siO_M%4>>&*1sN}{p|F)pFur3t%170Nn8g>EDpwS zdj(hYKI57>c{<^nETBoW#-+r>o!ImTwhEArsz!MK-jhw1!PI92D7w-Mh1#Hiq{wl$ zh$yHMI`)e&zq4GCXv8SIS^u(^!_}fNlUZJp%Xb#o7dzX%FSFHQo|3w2GY1AHiA(jK zso^J9ghi8&7&Gh~yKXnWCV+WKbPk#;5DTMu7@84QPA`(W#g;5ImD-n}Z?$+mHi65x zwcfVzXU|pRy|FM_Vph$5yXe@54D-X>5DbQb?uTP;w_;op^ct$B8DXU4CWnRT`=Q9M zH}~wA0geLqBB%#6_&3T|br}8JG&#CL>g_ z=TQgGhfyQ7<$Ra4zO3bQ=x$7!_~Ka+y*9;tm8rL)J(}L~!v!^0x9OZ3F)L5sCsQc0 z%vUo<&JVli8+Z~!)~vOXFAxuuvU8LleIO8A`;2`Fd{io`9cM7c>Qa_3QvuBJ$%{sP z0RZybGysLWkMK?)+~^vU`y7;K>c-pZ0O_~X8JqT(?iJrsXtQ2?lwdH3VTf4c*F|O*ydxvSBZxp=igO39By$PUUMu!c49g$}ePo?&#tx+t6CrQy zndOdv1kT#MHHgFjt+_o>otx?iU1)!}hBX}8h1DcoV=(Q3EMhqu;Btz*!&`ieCnDN0$PJB{YK z{m4Gk&)SXAgpR<*t^{5`3B51G-e_f`Xj<(Pemtz2|A{f$(HgbFe2Ni1W zqWooUn5Mmt)>v&o1`3P4n0iI7lFc(dKXnmhC!jpVVRU41>+0Rp{ACN$sH>%vGiTNM zm>21fqxBfsW-IhW(6|CB+^lD(R%hlSJd&H)LUo9Ju<`wlW?!41rO8u*6GffwGKcVaHp3ILfPQG1K zJ8;j_6Vqjr@|zZPyA{8!!?U7o8Dz^T2-`7N5`O`>vFKmfW{GGQg7^3N?8htU?T#?} zS<5rsFw5eY+BT9BWj1;P*DP%g=iDs5o^H<$sQDZ5DGV>t#n!MP}QPwjfCsTjN>Oy?2Gs{&8Rxf zeGB6lI^r2R+T002sQiEZ@O9q)h1<(Q&8~oB0>_B*)9$9V3iZz0{q|uyTt>u`+XV#6 zbd51r?;(m9YaUk3r+n+n&6*=EO=&h$HJ$71O*?~ZKOpbwDr$kC4HKu0=I~ZW-Z#F% z?@%DoHEG`mb;_?*JQEb9i1c%3emR-tR58X~s}6(6}5FU_Wqu^`H;P_%XzsHK`ec940O<5Nb4Iv z{laTWe} zx`!`bUaL~=hhHKz@seaZJ^goay!JbcS5&O~@>x^dcza&Yv4e&)t9FW=%vbQfU2G)j zJXw*#@~+<8GzP|2dHYXW0O|dJJ-3XD#pl4gINMg;+zYzW!)pSna_QVY*v^Y-P{VkY zI#HmUQis8G5sp~jWX6a_{b~wnh``L96@d@&38Sh81upCagC&)GE>vnfj$z?M45ne5 zJQQfgwx@s+LOt0jM7#AOD0y~gmqwkTd~CS%oBIl%o~oaZ-M7pu7SlyOPbZ@s=E+&;1% zZ`>C<>~43#jzW#zAx>Uh+Mhg*{hn9Nq`3@u+Z1E3*T7q*rZ{ z=~L}J&c0KG4nEuKJ?;8-(kF&~Q99-@rimTV!gYy|15W>u&yXr=Oao^$o(#qSwj|AP zPZ&QFIP@-{?w&+vV(6#egrnJ6CcR8{#{2#4YT;7r^r`OV{$0Ab$ZtLRU8xoXjGbm~Uss)El1Pn%s3)R%iOGxs?fDr_cm?e79 z`Mry0MMBW1^-^l7M3X~AjtrH&1fspN;1+Ju_j7#tRczl{4^GWL34Nc ze{~J5M{&E6`oMC=)+xM_n={>G7H89WZdu@&2<(4S}{G4BM5A#fhRc 
zlO+F1+;0ouxN9=~P^9k`M(XJnF0Fq-1`*G-O}KQC(j0E6ln%qyw62?mR4nj3tjKSH z+=5TFljOu4*HKA1@KJWZq}dj|p^Kqn@s44=06`a|s`L)NsBC`3?BOiA76hhVP~Jog zb6LrZpSNz4?x;BHYgY&#pp1=mJX!(Ji;DrZyVDQPi!DndXM>KuEqdwEmJ?SVV(v`r zmBSdQRjW8Gxa+*YD+RPB4BKsVyt^8>&E!=JBvvgW6*uTlP#ZJ7#hx0-I@`A#X<7=U z6U=O{5n7?W2OC#}1TEK!2uayP)==p^%W;(DwG`KJ1LYHv1M;hO0|X?pG5KU?$6N)B z0wf%dT$rN6&*5y4$<$pK2rDgmB;UWo!Xq^U((}(pdZNpGl417!?EQRi%hTGFdSBr9 z3V4(cdo#U2i`zdZe{OlyQ{&9}&9y)=f#}sicFiy~hoj?+^+8z6`TV{X2T4FI4F;-Y zfN%#(7*q$JN{LFYqK{Io5)%$TSqg?ztX!@VRh42QZf4YcD@sDUEaS73as>%x8cL7F z9Fh@Io%~|yvY-nCEx4yLhdgpp-Uxw4az2O-ajnEOrv}QNKo?AI?riD#IA?s8Yt4+i zE&|xaE2P_nDrUF0F5vld3;o)vtLe16>#rnWS?}|^Wc9PPm)i<^oZAKU82whS_aO1@ z+*gbe<;~d0qAt7xqfoDJ#cx|iU6a9~bYh5HSCCF%mt8|R6vzg(yis)ixI(WG>_^{) zAzBwrQYj^c;2q(wo|f=WoN@%$k8{5!f;4XWVMWlv^TAHw4!?+fv+A8L-jLwM<$kYL60nOE}cvYVk2JDPU; zJ3JbKWsrtGPgBHFPd$Ta8ZUTh)~HO1ot9D*xeH|n*ba8V7e%@Yq*aW{&qO111o({s z2hDNvF$(0nawrH1H5A00lt7r{z>-3ree!Y% zkwx5tzyRo3v;bG{;ff1DLBEt^(@zx7yXn#lb0UZdH+!$#D^^DY-nr+ zH?8BTFm^1#&kSUg#YN*%=scw8yV`JRZzL z==GxggPN2#j8@ZKp)+|jz$G=!gdn8p09*ld*b2ws%G<&a(;7?SV?wW z;JHy%5#EEiAyfr#>(3Y6PLAC^OY6I`PE(`T>C*0b2T_tW`x#=$AcnEZzuI7x2N?>t zbdMnJo5OAuxrf>z5&fF5o;SKgC;*D1gAIWeftKu!I8XtqZWvC-p03E@_KXRb1OT27 zoT}Mz4gYaL{0m%fSHH7mnY3iZm`V?<3nJ8EhXh2YF?CO90HVVU93U7?Eg$aw56pVYkz!qW~1)@u8dEAG zNr)iAH$u#3x@!AtY5#G`vx{DWUa`+d7?Za6oNGXjGh!{Q+Odh8JhgoL!ilR@(jjO*r5`s;j2$t*D}Io)x=whR?mxumv-!(o)K!wrm20G_)rwJYk2RZC_?GrT zO|}raN*OGjzifUPP;Bad$^b5`jo;wOIFTqns^2-(@k2blu)hj0KQfyT%a#Bz@$$2>dR__5k#sLg9{@$UmvFpVMrm!{>E!-tAZh^YSx_Wvu+Js&s1H`I z+gOe@vI8i;Wel!Q9uZj%zx|yCMAedmkoGB4fY!lD5?3OD)2tlydNZbO_i3pP;*JXc zC#=K49M3MFLGh?BU@lDkXTaMqPWlUBAbnv>F_8sur0qq^HbE^;k;Vxy`PM6mg{IGe zhmzI78H@fvcjZP84Mv+Wc+F36GM@OuE9P{mxYt@b(GwBd0zj}pj1gHqQuJ7Te5iLYLAsZWq$h4B+P(foNCP8^O zea5JR1W6?M#7tNH(^iNwy<_m_)vC*|P_wUusa8zRbxC;+b<@?|F0f%fqg(k7Eka`@vu;HZk!w zSWez=BGNbbubbtYdwTAX1atc?#UYQGni;G%yA&R){6jxMV%^NwZo@|tPh5WR&^)+f zQ~vM=6`lgV+Mddo3OO=rF=^39NXLVJGPi*4BYbiypV$>9`|M#Yv=<#_RcV<)!+I#C 
zs3ss^Mwl`maRDONqW-d%>nJcU8IBh)CeoUSghRkXwYcOo$}U{0nxLD#<lWDUwWW;WT==`0o@*V_y8#;Y-v+bnz3o6w~3hBcBOk1 z(b=l0+M%x--;cuNd`(zI+rKTXrXdPam@RQ1s=D3ZqJwM5Wrz#)_81Z{AfkQzIs{`Z z%`7jLn-jDehEFtxucHU`egs=%57re3H>6#T^ozwJt*NA^?0I57#E;ktI8>@+ku{Vx z`m4UPCNbZ)pSs_8^-&gIpzcjkee3}jhyo$2BoF4f-AqbmHq|^f21?SHh7dx2ZX&-b zdJrCw0`c14VA%t$sxCzw&&X5yd@F54pTo1AXf0B4hIao9?&!276)_XK)g zwc}b#XtEauPtY;jBEkVt0LW zNWzA6S-f+s-o8YB+@?mIew*4Nd7RMV2&`2FZOvQo!EelfH`k{Rvi#CVp_u%aR{e&N z6w#U{9o;qdk_)8p4X3Jw96B|sQNuXXvXu+0;ZsdI4tju%^Rz$^qN1TKi?;HYiNFfk z-B`k1>(rjS^=5D_f)I!=u)J&?+rtB-z4W#&89cO_6SwmcjvGM2KTKrfaT9-CCop{v zM$1{a1Z;h9qu_jaLfr?HDC5V;spUU6XaCens>84mZvB}qIe)$esG_e7S)id94JMc zo0WL<=OzdXCtb)o=u0Q$(DHb)YjlGAX9dl+)zr0kQPE&~8Hecar*Vym77~vdceLJm$#?6jmr*Bsn^HXU$ zXxc*^ub9d?B1m;!&uHgv1D);N=%V^9`6c4n-}XGR@+>gyGBQSzNOt;e$qt%~ z`j9MmLUuYhjg*CUWV2Xz_oRErFkDR0L&q5kMvH}p3_`Id2O^hwlu9zaG7}Doxp1*l zk)c^?FcmRD%j-oql1|q^GEaqpujQjeqR?8v%z3z%RMww#7Q)Xy;*czUzs{xN`ueuW z2aYKnES$SvSJqL9lYBJHq3!N|yML1UeW2(`jG8pYI_nZ+J%aVs`R#|36bepGjSYGt z9!_M@0=nv>Sa?g7d6IcV_!)UAGmrwMv&I=${;<*P@ry{JpA^ft3cjTX#ci1}+Io%H znxYtEsFxyc%s{!*HZYWD#ZutmuwPAQ)(+S?%V}s;tBXri;sZ4RZ1Sdpq$bPWYWzJ> zXvnxCJiG^=U=C5Tcx-629X)0=2k7f0><38$9^JBj z)6L1-m{E|RzaIaFSz|<(q*D-zO8o?S+|_%<)&@Y^e=>}<%jmn9vvYCeOCL8q#s3C_ zx*QX_Rb%HZe&CFxMSIlp>%td(onQ4>{Dld7g)^3QlPz%KWsr2HJHs)s;n2TH(dsZ- zLY{V4h}^Ga&6l_sTr4ZRM-DE}Yv_mEr-vT|yMiT~S!nrMNuaoyXl;k+6wOEKSzH28 zYDY=1!9xIH6(KB?n+XWzVz_NYpk191;km-~db%~mq|+~h_HXaHbTatdn+nKAlfx#! 
zd-8*%aWs<(m{a!QoY6PaqcNR~-|QIsYYBxYXaz|pd(E)iB)OJhZpknM8*}U2I5+|> z06MP)PlJ;!K{A^1{12&Dr|fa1MJ9S*9jClD@Zu0QxBKziv%{zKRYSO-PM{*OD^YYk^EoBq+ zh1k(gd+N$)Fo%OER|xsR4IML*CHElr4cP7rz(qUc1c0L{@VeOcF5q+z&4HBy&>_6w zoZW1$=OjgL7s71FFaM6g4S3Fc(|v{_4oHq)aF=@kX~bE>M6&>c5^VF`Ut;uewa$3l zz{QSs+z5SQ3*r$9QbtLkhKNn*9Kq`1++*8Qt@r00G~JSKtOsjiW^nKTR6Qrp%wRPj z>o~*5zfvAjq$zT_?7-9Xqqsq1N4sIM2e@t$8iDipPsF!BxIR$6z;uYP<&mH!$m}P; z7~!#tubULoW2KX(ZP0JjwzCV|LqEn{mN4mPP`9tiFJBoCcJj>&#hoJNUg<+D)y^J;IzlZ1sT-yqRfVtxHk7fj#=2P@ezuSf*R~S6 zH*O1-bXT$)*D^?14REZWrC%$_wldv-I@z-P1eI~AIILZ-slXR7u zy^AEgOtyL&ZP6tKjhR+LOLl9ZJ(r#CpRKQ;VOck zdXu|3d=*=@@Q4=hl20&;!GJFPI!AaKrZ<8HpUN_xSfsQSnHv#XI1#}R$ru^Qs{7(2 zTvmggqBI%iU_smSONTMa%Pky0DEBO-iY-?A3JC-)%P6${`Ecs_XR0;aL`?>mIAm_( zJPho%1uN{Ce5fj-Tk{*kF;o|X2BtfpkE9#*j1#S=7Vhw5Y-da7U0N4~1iS~OHD=Jn zF6E@Jov666no!3-LEj5D8M$j)Bk8QOgL4cVB@b`OK&ng?*37(0N9YOL9cLUF`Zt@Y zS=TY1mqW&sm0Jo3T4dtbhk7NX+xK2i??N1>4J1)a~}fgs_Q0GTRsY6m4mN z?PG|pNSPRj#o1n~zGXMf9#v5s_&iuSbsx0Q_4<_SJJtlALbv!UHB)oIx$%j>XyU+K

ob(c!cb=fJj#m^hs6X=T?Z2M{s3FlA0AwkY{^ImbUP2T0@dnZ;}6x~3g z?pV@$2;U@;ac>>Tw4Uq8wC-n+abXNA^N~B+BYW&sk&b1y2JjG&Ul+uE2BnFr-X@6f zX0mo&doBlui140uKR-6-2wq?DM(!f{Wb7j8k0Ox`lfLp2J?`2N$@);nOs3e5HLaIW zKBHm?J-&?d#}4fkvG+>8F&}vd-KITof1%j?wTl$@lb$>z4zu^xWzLY*b+&#YfLpz=790Yb zc-F`EuZhc@z>gwES&#Ps{VE45u%#qM7h*;zP{SZbS6F|x*#i>mOTY%f#L41W;(0NfaIx}6`Tq6m?6Yd^b05kg~g zIcKhDujps7zMP^rCyKb)DLm0$mGSjuYR2Sevo_@FZugqO9}^EQ+N^?@`Pn3EYGqN2 z$}z9bwCCsZ9%K3^!Ukrftw-pJOITiaS|5XzU@HI?+}voM79(5;#R@FLhV#!{3YrlV z%*AE<^|{R5@o@y;S(x-c$hX0>+rKqO+^(6pJe*1-U#zK&HgTOj_x+lqKjp)`A$DQ1 zjN3*fUXRJ)M~a8*^b>GHDCL)LyOleo*-&e)2OA5R1rLvRjJjv?{OMJ-1&wS#&xQ0! zq_67XS6t@NGe=YMrpd;z7$sDHRhu2^!lR5^_TB3=WB)y7f%mYar(wlSI(K%f7gk$^ zn`NCX_iNL@SjS*OKNlR=Q-rOFRidpyrI+_+p-OGlk}A-G=zwxZ&$eGxm*Z3_jyVeb z7gLWD$n31&+2~8D`>Iq*>TnU|-zhec>sa$%2c;;nk<09+bAoK>4Dy#Kf(3#JfoMNJ=AHn2HsU z3^uh}x?%w*6C+z{WfR6aH2&-B6^i4rt;fP~rR!M9L-;s_nW^Jp>%hWW-=2quJD0aA z_s6yH@Zk2S?Ub!N6Qa7xiPnsSOhYzzcvVrh~n}A09pGZ~zq)=%qNylc=A-Dac^jgJLf$ub( zNtP?H@Erb%XR(Y2&?p{{$rprueQ|PC-lzZv(c5dAhVd-lnaz{4Z_avqZw+x!pJ|`5 zlVbD38)_K(h+%B5JA*Cw^{nzG)Us2>*@lPL#3{KB#+&$lwh@sN z%Z#CU4wr+`NSe>uA7rB)nd_HQMFVB(jvbZh0I`CEqQh02svb&iO4pPR!Rg=(@)n9_j=T1oHM4qO zyAuXcarbkF_=od1?RV{iLp`^u(MBBe*hph^u-n)?>@V|1CE}9_izvU8bIx+Jdv6iN zqT-qy#(hP!Wak|%66WdAA#YzLr1li*1nTVZx&{rrblIjyOk2BGhq|}^VW{${Z4`~f zGGpeMRZm;RDS1QJ?GdQ_W9s}@ekA=L*5NygkC~3)@45AFe&ioA;s@VR$k@@)!Q9Tt z*5QNo_nQJKqi_ArR8$mH6%pj86f?GRF?KRH)c-qMMBm!n${nBV?;P^KvaIw?-&ukT zf0q;Zi^xdL&P<0-&BR8BPtU}}j89L`K>sf~qqvj4mAN6mjj5F}KHcxf_#F*D2$8I; ze?>p~_7SIMU||29FQ{)PW^8V1_Rg7nujY4>qLZ<;3O?Ik4}3(#{zN|B?tR4=GvI$v z75_E`$I64WM={x*+ zeJTBa#pxOTJs$raX?*&>$dn%gE^PD8jWoA0#ivy@x8b*OH2;(EyJ1yxBPTP*f6+ld z*p>hNkCBlPpMjP2_sz`Ag8zSVf0X}cO)M-d_#Zbb1H31ky}jS%-gAFH^RH4LVV3uu?IZ6a%+AF4`}3ded_4bN+n-*(*ZL>?(WAe^?=}3@ zCc}H^qrE>le?C9n`J+#NKlL8|X!j%S&;6S+`##fcm2J#F=%?>f=DpT`P6PT68ttEX z@Q1?s!7l!vciRt({Z*14pP7}N;lD6g`_(+31PTw#hv+;RT8pQjvdNHWYM9&K** zGfmveVEQPiCU#Lg&*6EkHEa_;l=sU`_T%{@?t#yNtIt8J>m3z5Kv)3&WNy`k*}=UO 
z#V>HbCkTM{_8Mzt%UjFP89*iuzcidOl`6|sIWEpmIg4<7Jl2p8SU$H(^9%sJxHn@~ z0-`rW){x5wli91Y%N;BmMczhgEm5%u`C;nB$F;I zzXl2F9Rbpw{eejD4i*!`WS|_0i)$ovGbsQA^JXLg4&;!roSzJ5YZeS-4e@U2u?LF) zQkG*4Nl>4^ta1hR$Hi&L7a0q`OM_nW8L{zt*vNGIG-J zXj+7-Lp!V6wF2bYbqk7&f{KEwqC-n{{O0H@&pSL8-sx$wULP(V?uwwFKtfL|Pp6j| z&)WO`YnFVbDZCFY@+~?E>9-7n(nmX4w*?vt0XUs<$0vsfkL=HqI_GWkd>0p9@t!9Y z3spK@%yP$=%l0rM7F;n}kkBTtLXOztbGw=EvT-6s1YXtgMvA4>5{8pevik`oyOtV> z?AA<|3$_BC{@DJSAcDBzdRab>5E{|xo1YKwva?T85zfY82Aq%rs_3FcCx@5>xNk25 znH>E-s~5gdXLa5A;I|bU@s3TnLpC@n_ zNUDd24G7Z~=JBcAkNc9u++U{_+hY?*Jg~Jo8zU|kX#I2T3k_4xPqq%6@h%!R$eIT! zN+N^$M76zJGfugxz_IHpeTLD1!A+uF!+Dv*RJf5f^okH-JMfmvM_xpKjuELi-4tFh z9IBo#ne^9{S_F9AdE-4E6#^F~Y{KO?p$dKU=1)jZkc^?<7J*NRqm`eh_}04S{atr~9`jyc=Iddpni*x( z04Td-%d7*BF2O6e=^6=}ElszW-4v~J;*}0VZR0@kuZYs62$L64htYF=MK2dll>!(9VeP)a$4?L5i ztV3IY8jX}ndqm2d7}rOumR!PbOyP7cTip=Wm!6va6ZA%;}z;hSkZF^mH5&oL~ zk9y+**%}#C5_LD4u`d}3RtzN4Qc`K~wv&VX145!l^_VGN$$`{uzV;19Dtdip6egn{ zQ?crfg?*z7mEE0EW^jladY;vP8H$I+UUK^(CTdh*Xht;kGgvMytG^Xg$7_Z75>dTz zDDVl1ru#?aYpG7jF{ctYtaBtN-sai>2^uXT-UX^8M}~k+4%EyoM_j<+Db-%@wjVli z4Y=bgR%X;9IpL~7?WRK{N%yw_v$B;9d=={JW^HoRNHp)m(z{LUCG{~#nMKqUd?u?z zY`FtPrZ7#M4C_p`X6I+U0iTVjufPO|#oo^Y2s=`wdUL5C zv2>G?w7&8Xsx^iW3aIHv@yCK+jrg(96Dc2{Xa?Ev5T{jq7nF!IY}0*l9U->|k%>(* zH#C0CY(r7KdF)viSr?hy(PglTeAM}3-sjsvD}v*Cr+E6KEEF7nL`nc-8~M@084<91 zT*1a)9lkUtxcBlJQOksJJDpNNjj>+YsKVHXUd<6$B#{zmd|?|>G(91i`QeKW_65|D z=?)W{4(;VvN9lRT}@+ZpN;t>(JImiLqLmUeF5hsza;!7K__qo*QYrWWJ zl(S@M{UG)l?#Y9nuXUWn^d=|&o7YprDT{NK`BZ|ICli z3XYM2vT!xNB2H@-Dna4t3mw4}@PgO~>hDvV!v*$_bERNV+u_gFqaT0wuAoZ*t6W+3 z(@S^tXSoPyP0TPxC-Y3YKo;!fWK`H`93_%X<3*uc6of*5M7H|vt*GuHJlpNG>0l>w z1|dI3j}7pT&cHj`kWjeCQ!%FjE4m4O4I@_T>yRTOE@(4E(EMyy)=Pibg{Ln8WTf=D zn}Qh`nTnj^0JS@yx(%{url!DFDG7wpX5w;m3wk5k%(N7GqD2t;AU1ew$JqP-#U7bQw2~4VbF5lBQ6@<{o?JiLTwqg`Jyg>FEnu@&o?M75Tzo1a; zb#MxO_oYf&hz`Ggnj*L58@+|x^M51Ui|!=@LuAJ`>g<%%VehzvM0YA}uf0C7c$1fr z?q>D3JffF9?ofW~CJ2<>)HcMb2JAzOx*z0nC!CoSuDjYa0#HD~xY=6r-z9rHD$Y0O 
z(iHCY&h5{S$6cWSINLCE*-X#SBVQc0*Au5LuTPxW1sUxYOoNHV8u8@-d^W!ZzvG1XX|DLM8+dFE&#SXWW4P#o-hvnb3-#gVLtHe%!{c{;pU zcv&&bMTVOEAvI*)!8qORr;be9zO`Sksqtz3YY!BWp`$un{JEbq+XcHT>cd6sp#c=D=l#W7iKNcMcfEZ^UF+h zhiprPc$J_zMj+p_=D;kUa1+v%jbpY~&o#?Y`Yh~wBkWki0J{W3QG!)8gR1r!*_H;e zR?(<(+3)M1Sn)t%{>Y}%o@X&;E*#J3<kS~DfJf** zINJYEB4VUt`df&I{-6HkAHU*n&7l7s#gQAc=%s`1cqVtUg6n@jd6}Os>O7%$zQzh} zbJ4*<%_^TQwCxqSsGiOrdiuWCbXf9!9yEvFM}djk#EwU;hn>a<{u8!0yfg58rdQ60 zB}rD0k$vYlMiI^XfrY5MXn~G}D6my5ux`iz_*F*nUg%qwjz{;(odq|MNFmV4a*$LU zjh+ifg0kSXVX|4qs|UAzwWK?q(%{nRC)^3)s0Rfw39y6_joS*795lpqi@t7kWuHuK zG9X(`T$HmWR%uJEG7biiV>vw?S2})$S)hKFQ9oDQw(j=o8iY`k$}GLX!|8Y0{14;# zF-HG_#TXbF{{}8Tw3Yt2Y5#WD|Lsoy_LM)|^S^)#0T~G)HL3qw;DYJzTK?a`Eq*(w z|HWhdr|8nZhvIkF75^X?g5p9SgYl957x?gR^x{7b&3k!8cgJ_+MBK*2_M@r~MJEMg zQ}cIN=^f_be;3^{{tfYbIKQ$EM#c^wa0QvT(K|F_?&SV=mZGzrot3fm2VnCXjro^? z)JI?bZ7Y2Gzm>9RRq-|H-k}Yae>w9Xj=Q$@KNP0^OoxyD{7*0Yk^UF<^S`|1A6NyS z_D{#D#eR?5Um%l_{ySLo-qR1Ph5k2Q^q1t;Kj4FpRsBz6Fnk~zf9B=Cmiarj_5TMS z{1dFjiL8MIqK6hte?@r&%qURKti^Rj#EH{0;FIbavIGC+H_@@BKt=1*NSu^Zel8{! zKop(ch$vNN3?|H!dblP?@5zE7f3nh1n!Wvox;B1J3k|axbK~xtvk8Uqz=y*1<*r~e zzEZ+L5X-#or3FTn8U8Oe>o{qfK78ol`!78| zjtoGCIRr`zGpU~P3%LQhNrW>19ELtO(Guh<9RnHtTsP&WvVE1r^ZNAIJq4T8{pcqp zskMX{#q@}ufjYVJa(HZQnixcNSxue3|9-e^3+%2bLh2}%RyKXi4G{oe2P5J$UH9HI z3$|=;#&nhmBu3ICW}Ftry~T89I)qVh6*ubI!I?%i@8CiT`0_HVC=phCiV;-}1X8#R zIZT$LhFq7gh9`V}9dC$ik)ZZ*;jBbMsRP^~kN(>+##rSjvGST|bRF20|9JAZG?&tB z-F5)`XXSrDtP(w;2r!9X&J4 zyM_K*dmn|Q|Gze_ST|u9i0=6cFQ~Ajj_t%YE3`|8R_zx=0YVmn93xfz`o41rNgT2u z9%t@+dUuY_(ks1dNDq$+3%317|NFoRCPqCxn!EK9Q@GEvZm6*`CbqG9c;VxA;R?n|Ho)XR^Px#|uMV`x? 
zomh`)&Z%wC!!CDBS(q#^#gWOG9lm7X#SBwc8l-QA6-wB6ZDFw0*beX3v7NG7$FUSr zA==1?R79!BFJHu5C@zr7$nHrj+U8)&+)g9F7&F44 DAG.pdf - -lint: - snakemake --lint - -install-monitor: - $(CONDA_ACTIVATE) base - conda create --name panoptes - conda install -c panoptes-organization panoptes-ui - -run-monitor: - $(CONDA_ACTIVATE) panoptes - panoptes - -run-with-monitor: snakemake \ - --use-conda \ - --cores all \ - --wms-monitor http://127.0.0.1:5000 + --dag \ + --configfile $(CONFIG) \ + --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \ + | dot -Tpdf > DAG.pdf install-tensorboard: $(CONDA_ACTIVATE) base @@ -151,29 +155,4 @@ tensorboard: $(CONDA_ACTIVATE) tensorboard ls -d $(SHARED_ROOT)/models/*/*/* > tb-monitored-jobs; \ tensorboard --logdir=$$MODELS --host=0.0.0.0 &; \ - python utils/tb_log_parser.py --prefix= - -install-snakepit-scheduler: - mkdir -p $(SHARED_ROOT)/snakepit - cd $(SHARED_ROOT)/snakepit - - curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash - - sudo apt install nodejs - - if [ ! 
-e snakepit-client ]; then - git clone https://github.com/mozilla/snakepit-client.git - fi - cd snakepit-client - npm install - sudo npm link - - echo "http://10.2.224.243" > /root/.pitconnect.txt - - pit status - -run-snakepit: - chmod +x profiles/snakepit/* - snakemake \ - --use-conda \ - --cores all \ - --profile=profiles/snakepit + python utils/tb_log_parser.py --prefix= \ No newline at end of file diff --git a/README.md b/README.md index 97dd07078..18a152a7f 100644 --- a/README.md +++ b/README.md @@ -128,15 +128,24 @@ make dry-run ### Local mode -Without containerization: +#### Without containerization + ``` make run-local ``` -With containerization: +To test the whole pipeline end to end (it supposed to run quickly and does not train anything useful): + +``` +make test +``` +Or run +#### With containerization ``` make run-local-container ``` + + ### Cluster mode To run on Slurm @@ -149,6 +158,18 @@ with containerization (recommended): ``` make run-slurm-container ``` +### Specific target + +By default, all Snakemake rules are executed. To run the pipeline up to a specific rule use: +``` +make TARGET= +``` + +For example, collect corpus first: +``` +make run-local TARGET=merge_corpus +``` + ### Using Snakepit @@ -209,20 +230,23 @@ Step | Description | Bottleneck | Comments --- | --- | --- | --- Installation | Installing dependencies and compiling | CPU | Takes ~1 hour Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size, sampling of huge mono datasets (100M+ sentences) is the most intensive operation. -Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py). 
-Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning threshold is controlled by `BICLEANER_THRESHOLD` config setting. +Data cleaning | Basic preprocessing, dataset specific, language specific, rule based and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/tools/clean_parallel.py). +Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning thresholds are configurable per dataset, see [Dataset cleaning](##Dataset cleaning). +Merge and dedupe | Merges clean dataset and applies deduplicaiton | CPU, Disk | Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece). -Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others. -Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. 
Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece). +Augmentation with back-translations | Translates mono corpus combined from monolingual datasets in target language using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others. +Training teacher | Trains an ensemble of big transformer models on augmented dataset | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) or `after-epochs` parameters depending on datasets size. +Continue training teacher | Continue training an ensemble of teachers on parallel data only | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory. Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster. 
Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts lexical shortlist using [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools requires uncompressed datasets on disk and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on a dataset size. Good CPU parallelization. -Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](utils/tensorboard/tensorboard.sh) manually to see training visualization. +Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Fine-tuning student | Finetunes the student model by emulating 8bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick but you might want to reduce early stopping threshold. Quantizaiton | Applies 8 bit quantization to the fined-tuned student model and evaluates on CPU | CPU | CPU threads must be set to 1 for this step. +Evaluation | Calculates metrics for all models (BLEU, chrf) using [SacreBLEU](https://github.com/mjpost/sacrebleu) | GPU | Uses `datasets.test` configuration section. Export | Exports trained model and shortlist to (bergamot-translator)(https://github.com/mozilla/bergamot-translator) format | | -## Datasets importers +## Dataset importers Dataset importers can be used in `datasets` sections of experiment config. @@ -256,6 +280,119 @@ Example: Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `.sh` and accepts the same parameters as the other scripts from the same folder. +## Dataset fixing + +Some datasets require fixes like detokenization. Dataset and language specific fixes are implemented in [pipeline/clean/fixes]([pipeline/clean/fixes]). 
+Naming convention:
+- `.sh` for parallel dataset cleaning
+- `..sh` for language specific cleaning of parallel or monolingual dataset
+- `/` in dataset name should be replaced with `_`
+
+## Dataset cleaning
+Some parallel datasets require more aggressive filtering.
+Dataset specific Bicleaner thresholds can be set in config. Example:
+
+```yaml
+experiment:
+...
+ bicleaner:
+ default-threshold: 0.5
+ dataset-thresholds:
+ mtdata_neulab_tedtalksv1_train: 0.6
+```
+
+## Utilities
+
+### Tensorboard
+
+To see training graphs run tensorboard:
+
+```
+make install-tensorboard
+make tensorboard
+```
+
+Then port forward 6006.
+
+## Directory structure
+
+ ├ data
+ │ └ ru-en
+ │ └ test
+ │ ├ original
+ │ │ ├ corpus
+ │ │ │ ├ mtdata_JW300.en.gz
+ │ │ │ └ mtdata_JW300.ru.gz
+ │ │ ├ devset
+ │ │ │ ├ flores_dev.en.gz
+ │ │ │ └ flores_dev.ru.gz
+ │ │ ├ eval
+ │ │ │ ├ sacrebleu_wmt20.en.gz
+ │ │ │ └ sacrebleu_wmt20.ru.gz
+ │ │ ├ mono
+ │ │ │ ├ news-crawl_news.2020.ru.gz
+ │ │ │ └ news-crawl_news.2020.en.gz
+ │ │ ├ devset.ru.gz
+ │ │ └ devset.en.gz
+ │ ├ clean
+ │ │ ├ corpus
+ │ │ │ ├ mtdata_JW300.en.gz
+ │ │ │ └ mtdata_JW300.ru.gz
+ │ │ ├ mono
+ │ │ │ ├ news-crawl_news.2020.ru.gz
+ │ │ │ └ news-crawl_news.2020.en.gz
+ │ │ ├ mono.ru.gz
+ │ │ └ mono.en.gz
+ │ ├ biclean
+ │ │ ├ corpus
+ │ │ │ ├ mtdata_JW300.en.gz
+ │ │ │ └ mtdata_JW300.ru.gz
+ │ │ ├ corpus.ru.gz
+ │ │ ├ corpus.en.gz
+ │ ├ translated
+ │ │ ├ mono.ru.gz
+ │ │ └ mono.en.gz
+ │ ├ augmented
+ │ │ ├ corpus.ru.gz
+ │ │ └ corpus.en.gz
+ │ ├ alignment
+ │ │ ├ corpus.aln.gz
+ │ │ └ lex.s2t.pruned.gz
+ │ ├ merged
+ │ │ ├ corpus.ru.gz
+ │ │ └ corpus.en.gz
+ │ └ filtered
+ │ ├ corpus.ru.gz
+ │ └ corpus.en.gz
+ ├ models
+ │ ├ ru-en
+ │ │ └ test
+ │ │ ├ teacher
+ │ │ ├ student
+ │ │ ├ student-finetuned
+ │ │ ├ speed
+ │ │ ├ evaluation
+ │ │ │ ├ backward
+ │ │ │ ├ teacher0
+ │ │ │ ├ teacher1
+ │ │ │ ├ teacher-ensemble
+ │ │ │ ├ student
+ │ │ │ ├ student-finetuned
+ │ │ │ └ speed
+ │ │ └ exported
+ │ ├ en-ru
+ │ └ test
+ │ └ backward + │ + ├ experiments + │ └ ru-en + │ └ test + │ └ config.sh + ├ logs + │ └ ru-en + │ └ test + │ └ clean_corpus.log + ## Development ### Architecture @@ -271,9 +408,6 @@ Snakemake parallelizes steps that can be executed simultniously. It is especiall The main snakemkae process (scheduler) should be launched interactively. It runs job processes on the worker nodes in cluster mode or on a local machine in local mode. ### Conventions - -- All scripts work with respect to repo root directory. - It allows to not think about relative paths and execution folders. - Scripts inside the `pipeline` directory are independent and operate only using input arguments, input files and global envs. diff --git a/Snakefile b/Snakefile index c26224b4b..37bcbbc68 100644 --- a/Snakefile +++ b/Snakefile @@ -15,75 +15,6 @@ min_version("6.6.1") container: 'Singularity.sif' - -# Directories structure -# -#├ data -#│ ├ cache -#│ │ ├ corpus -#│ │ │ └ opus -#│ │ │ ├ ada83_v1.en.gz -#│ │ │ └ ada83_v1.ru.gz -#│ │ └ mono -#│ │ └ news-crawl -#│ │ ├ news.2019.ru.gz -#│ │ └ news.2019.en.gz -#│ └ ru-en -#│ └ test -#│ ├ original -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ │ ├ mono.ru.gz -#│ │ ├ mono.en.gz -#│ │ ├ devset.ru.gz -#│ │ └ devset.en.gz -#│ ├ evaluation -#│ │ ├ wmt12.ru -#│ │ ├ wmt12.en -#│ │ ├ wmt20.ru -#│ │ ├ wmt20.en -#│ ├ clean -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ │ ├ mono.ru.gz -#│ │ └ mono.en.gz -#│ ├ biclean -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ ├ translated -#│ │ ├ mono.ru.gz -#│ │ └ mono.en.gz -#│ ├ augmented -#│ │ ├ corpus.ru.gz -#│ │ └ corpus.en.gz -#│ ├ alignment -#│ │ ├ corpus.aln.gz -#│ │ └ lex.s2t.pruned.gz -#│ ├ merged -#│ │ ├ corpus.ru.gz -#│ │ └ corpus.en.gz -#│ └ filtered -#│ ├ corpus.ru.gz -#│ └ corpus.en.gz -#├ models -#│ ├ ru-en -#│ │ └ test -#│ │ ├ teacher -#│ │ ├ student -#│ │ ├ student-finetuned -#│ │ ├ speed -#│ │ └ exported -#│ ├ en-ru -#│ └ test -#│ └ s2s -#│ -#├ experiments -#│ └ ru-en -#│ └ test -#│ └ config.sh -#├ logs - - 
install_deps = config['deps'] == 'true' data_root_dir = config['root'] cuda_dir = config['cuda'] @@ -97,15 +28,17 @@ experiment = config['experiment']['name'] mono_max_sent_src = config['experiment']['mono-max-sentences-src'] mono_max_sent_trg = config['experiment']['mono-max-sentences-trg'] -bicleaner_threshold = config['experiment']['bicleaner-threshold'] -backward_model = config['experiment']['backward-model'] +bicl_default_threshold = config['experiment']['bicleaner']['default-threshold'] +bicl_dataset_thresholds = config['experiment']['bicleaner']['dataset-thresholds'] +backward_pretrained = config['experiment']['backward-model'] experiment_dir=f"{data_root_dir}/experiments/{src}-{trg}/{experiment}" # training -training_args = "" +training_args = {} if 'training' in config: - training_args = ' '.join([f'--{k} {v}' for k,v in config['training'].items()]) + training_args = {name: ' '.join([f'--{k} {v}' for k,v in conf.items() ]) + for name, conf in config['training'].items()} # datasets train_datasets = config['datasets']['train'] @@ -114,6 +47,9 @@ eval_datasets = config['datasets']['test'] mono_src_datasets = config['datasets']['mono-src'] mono_trg_datasets = config['datasets']['mono-trg'] +mono_datasets = {src: mono_src_datasets, trg: mono_trg_datasets} +mono_max_sent = {src: mono_max_sent_src, trg: mono_max_sent_trg} + # parallelization gpus = ' '.join([str(n) for n in range(int(gpus_num))]) ensemble = list(range(config['experiment']['teacher-ensemble'])) @@ -137,7 +73,6 @@ clean = f"{data_dir}/clean" biclean = f"{data_dir}/biclean" cache_dir = f"{data_dir}/cache" original = f"{data_dir}/original" -evaluation = f"{data_dir}/evaluation" translated = f"{data_dir}/translated" augmented = f"{data_dir}/augmented" merged = f"{data_dir}/merged" @@ -151,13 +86,22 @@ student_dir = f"{models_dir}/student" student_finetuned_dir = f"{models_dir}/student-finetuned" speed = f"{models_dir}/speed" exported = f"{models_dir}/exported" -best_model = 
"model.npz.best-bleu-detok.npz" -s2s=f'{models_dir}/s2s' - +best_model = f"model.npz.best-{config['experiment']['best-model']}.npz" +backward = f'{models_dir}/backward' + +#evaluation +eval_data = f"{original}/eval" +eval_res = f"{models_dir}/evaluation" +eval_backward = f'{eval_res}/backward' +eval_student = f'{eval_res}/student', +eval_student_finetuned = f'{eval_res}/student-finetuned', +eval_speed = f'{eval_res}/speed', +eval_teacher_ens = f'{eval_res}/teacher-ensemble', +full_eval_datasets = expand(f'{eval_data}/{{dataset}}.{{lang}}.gz', dataset=eval_datasets, lang=[src,trg]) # set common environment variables envs = f'''SRC={src} TRG={trg} MARIAN="{marian_dir}" GPUS="{gpus}" WORKSPACE={workspace} \ -CLEAN_TOOLS=pipeline/clean/tools BIN="{bin}" DATA_ROOT_DIR="{data_root_dir}" \ +BIN="{bin}" DATA_ROOT_DIR="{data_root_dir}" \ CUDA_DIR="{cuda_dir}"''' ### workflow options @@ -166,22 +110,25 @@ results = [f'{exported}/model.{src}{trg}.intgemm.alphas.bin.gz', f'{exported}/lex.50.50.{src}{trg}.s2t.bin.gz', f'{exported}/vocab.{src}{trg}.spm.gz', f'{experiment_dir}/config.yml', - expand(f'{teacher_dir}{{ens}}/eval',ens=ensemble), - f'{student_dir}/eval', - f'{student_finetuned_dir}/eval', - f'{speed}/eval', + expand(f'{eval_res}/teacher{{ens}}',ens=ensemble), + f'{eval_res}/student', + f'{eval_res}/student-finetuned', + f'{eval_res}/speed' ] +if len(ensemble) > 1: + results.append(f'{eval_res}/teacher-ensemble') + if install_deps: results.append("/tmp/flags/setup.done") -if not backward_model: - backward_model = s2s +if not backward_pretrained: # don't evaluate pretrained model - results.append(f'{backward_model}/eval') - train_s2s=True + results.append(eval_backward) + train_backward=True else: - train_s2s = False + train_backward = False + backward = backward_pretrained # bicleaner @@ -189,25 +136,29 @@ bicleaner_type = packs.find(src, trg) bicleaner_env = "envs/bicleaner-ai.yml" if bicleaner_type == 'bicleaner-ai' else 'envs/bicleaner.yml' if bicleaner_type: - 
clean_corpus_src = f"{biclean}/corpus.{src}.gz" - clean_corpus_trg = f"{biclean}/corpus.{trg}.gz" + clean_corpus_prefix = f'{biclean}/corpus' teacher_corpus = f'{biclean}/corpus' use_bicleaner = True else: - clean_corpus_src = f"{clean}/corpus.{src}.gz" - clean_corpus_trg = f"{clean}/corpus.{trg}.gz" + clean_corpus_prefix = f'{clean}/corpus' teacher_corpus = f'{clean}/corpus' use_bicleaner = False +clean_corpus_src = f'{clean_corpus_prefix}.{src}.gz' +clean_corpus_trg = f'{clean_corpus_prefix}.{trg}.gz' + # augmentation if mono_trg_datasets: teacher_corpus = f'{augmented}/corpus' - augment_corpus=True + augment_corpus = True + continue_teacher = True # continue training on parallel corpus + teacher_all_output = 'model.npz' else: - augment_corpus=False - + augment_corpus = False + continue_teacher = False + teacher_all_output = best_model ### rules @@ -216,13 +167,16 @@ def find_parts(wildcards, checkpoint): checkpoint_output = checkpoint.get(**wildcards).output[0] return glob_wildcards(os.path.join(checkpoint_output,"file.{part,\d+}")).part +def dataset_norm(name: str): + return name.replace('/','_') + shell.prefix(f"{envs} ") rule all: input: results localrules: experiment -ruleorder: teacher > eval_teacher +ruleorder: teacher_all > eval_teacher rule experiment: message: "Saving experiment metadata" @@ -245,7 +199,6 @@ if install_deps: output: touch("/tmp/flags/setup.done") # specific to local machine shell: 'bash pipeline/setup/install-deps.sh >> {log} 2>&1' - rule marian: message: "Compiling marian" log: f"{log_dir}/compile-marian.log" @@ -275,71 +228,62 @@ rule extract_lex: output: protected(f"{bin}/extract_lex") shell: 'bash pipeline/setup/compile-extract-lex.sh {extract_lex_build} {threads} >> {log} 2>&1' -# data - -rule data_train: - message: "Downloading training corpus" - log: f"{log_dir}/data_train.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: src=f"{original}/corpus.{src}.gz",trg=f"{original}/corpus.{trg}.gz" - params: 
prefix=f"{original}/corpus" - shell: 'bash pipeline/data/download-corpus.sh "{params.prefix}" "{cache_dir}" train {train_datasets} >> {log} 2>&1' +# data downloading -rule data_val: - message: "Downloading validation corpus" - log: f"{log_dir}/data_val.log" +rule download_corpus: + message: "Downloading parallel corpus" + log: f"{log_dir}/download_corpus/{{kind}}/{{dataset}}.log" conda: "envs/base.yml" threads: 1 group: 'data' - output: src=f"{original}/devset.{src}.gz",trg=f"{original}/devset.{trg}.gz" - params: prefix=f"{original}/devset" - shell: 'bash pipeline/data/download-corpus.sh "{params.prefix}" "{cache_dir}" valid {valid_datasets} >> {log} 2>&1' - -rule data_test: - message: "Downloading test corpus" - log: f"{log_dir}/data_test.log" + cache: False # caching is broken in snakemake + wildcard_constraints: kind="corpus|devset|eval" + output: multiext(f"{original}/{{kind}}/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: prefix=f"{original}/{{kind}}/{{dataset}}", dataset="{dataset}" + shell: 'bash pipeline/data/download-corpus.sh "{params.dataset}" "{params.prefix}" >> {log} 2>&1' + +rule download_mono: + message: "Downloading monolingual dataset" + log: f"{log_dir}/download_mono/{{dataset}}.{{lang}}.log" conda: "envs/base.yml" threads: 1 group: 'data' - output: expand(f"{evaluation}/{{dataset}}.{{lng}}",dataset=eval_datasets,lng=[src, trg]) - shell: 'bash pipeline/data/download-eval.sh "{evaluation}" "{cache_dir}" {eval_datasets} >> {log} 2>&1' - -rule data_mono_src: - message: "Downloading monolingual dataset for source language" - log: f"{log_dir}/data_mono_src.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: f'{original}/mono.{src}.gz' + cache: False # caching is broken in snakemake + wildcard_constraints: lang=f"{src}|{trg}" + output: f'{original}/mono/{{dataset}}.{{lang}}.gz' + params: max_sent=lambda wildcards: mono_max_sent[wildcards.lang], dataset='{dataset}', lang='{lang}' shell: '''bash pipeline/data/download-mono.sh \ - 
"{src}" "{mono_max_sent_src}" "{original}/mono" "{cache_dir}" {mono_src_datasets} >> {log} 2>&1''' - -if mono_trg_datasets: - rule data_mono_trg: - message: "Downloading monolingual dataset for target language" - log: f"{log_dir}/data_mono_trg.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: f'{original}/mono.{trg}.gz' - shell: '''bash pipeline/data/download-mono.sh \ - "{trg}" "{mono_max_sent_trg}" "{original}/mono" "{cache_dir}" {mono_trg_datasets} >> {log} 2>&1''' + "{params.dataset}" {params.lang} {params.max_sent} "{output}" >> {log} 2>&1''' # cleaning rule clean_corpus: - message: "Cleaning corpus" - log: f"{log_dir}/clean_corpus.log" + message: "Cleaning dataset" + log: f"{log_dir}/clean_corpus/{{dataset}}.log" conda: "envs/base.yml" + group: "clean_corpus" threads: workflow.cores - input: rules.data_train.output.src,rules.data_train.output.trg - output: src=f"{clean}/corpus.{src}.gz",trg=f"{clean}/corpus.{trg}.gz" - params: prefix_input=f"{original}/corpus",prefix_output=f"{clean}/corpus" - shell: '''bash pipeline/clean/clean-corpus.sh "{params.prefix_input}" "{params.prefix_output}" {threads} \ + input: multiext(f"{original}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + output: multiext(f"{clean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: prefix_input=f"{original}/corpus/{{dataset}}",prefix_output=f"{clean}/corpus/{{dataset}}", + dataset=lambda wildcards: dataset_norm(wildcards.dataset) + shell: '''bash pipeline/clean/clean-corpus.sh "{params.prefix_input}" "{params.prefix_output}" {threads} {params.dataset} \ >> {log} 2>&1''' +rule clean_mono: + message: "Cleaning monolingual dataset" + log: f"{log_dir}/clean_mono/{{dataset}}.{{lang}}.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_mono{lang}" + cache: False + wildcard_constraints: lang=f"{src}|{trg}" + input: f'{original}/mono/{{dataset}}.{{lang}}.gz' + output: f'{clean}/mono/{{dataset}}.{{lang}}.gz' + params: 
prefix_input=f"{original}/mono/{{dataset}}", prefix_output=f"{clean}/mono/{{dataset}}", + dataset=lambda wildcards: dataset_norm(wildcards.dataset) + shell: '''bash pipeline/clean/clean-mono.sh {wildcards.lang} "{params.prefix_input}" "{params.prefix_output}" \ + {threads} {params.dataset} >> {log} 2>&1''' if use_bicleaner: rule kenlm: @@ -351,28 +295,67 @@ if use_bicleaner: output: directory(f"{bin}/kenlm") shell: 'bash pipeline/setup/install-kenlm.sh {kenlm} {threads} >> {log} 2>&1' + rule bicleaner_pack: + message: f"Downloading language pack for bicleaner" + log: f"{log_dir}/bicleaner_pack.log" + conda: bicleaner_env + group: "clean_corpus" + threads: 1 + input: rules.kenlm.output + output: directory(f"{biclean}/pack") + shell: '''bash pipeline/bicleaner/download-pack.sh "{output}" {bicleaner_type} >> {log} 2>&1''' + rule bicleaner: message: f"Cleaning corpus using {bicleaner_type}" - log: f"{log_dir}/bicleaner.log" + log: f"{log_dir}/bicleaner/{{dataset}}.log" conda: bicleaner_env - threads: workflow.cores - input: src=rules.clean_corpus.output.src,trg=rules.clean_corpus.output.trg,kenlm=rules.kenlm.output - output: src=clean_corpus_src,trg=clean_corpus_trg - params: prefix_input=f"{clean}/corpus",prefix_output=f"{biclean}/corpus" + group: "clean_corpus" + threads: 1 + input: rules.kenlm.output, multiext(f"{clean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz"), + pack_dir=rules.bicleaner_pack.output + output: multiext(f"{biclean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: + prefix_input=f"{clean}/corpus/{{dataset}}",prefix_output=f"{biclean}/corpus/{{dataset}}", + threshold=lambda wildcards: bicl_dataset_thresholds.get(wildcards.dataset) or bicl_default_threshold shell: '''bash pipeline/bicleaner/bicleaner.sh \ - "{params.prefix_input}" "{params.prefix_output}" {bicleaner_threshold} {bicleaner_type} \ - >> {log} 2>&1''' + "{params.prefix_input}" "{params.prefix_output}" {params.threshold} {bicleaner_type} {threads} \ + "{input.pack_dir}" >> 
{log} 2>&1''' -rule clean_mono: - message: "Cleaning monolingual dataset" - log: f"{log_dir}/clean_mono_{{lang}}.log" +rule merge_corpus: + message: "Merging clean parallel datasets" + log: f"{log_dir}/merge_corpus.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_corpus" + input: expand(f"{clean_corpus_prefix}/{{dataset}}.{{lang}}.gz", dataset=train_datasets, lang=[src, trg]) + output: src=clean_corpus_src,trg=clean_corpus_trg + params: prefix_output=clean_corpus_prefix, prefixes=expand(f"{clean_corpus_prefix}/{{dataset}}", dataset=train_datasets) + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + +rule merge_devset: + message: "Merging devsets" + log: f"{log_dir}/merge_devset.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_corpus" + input: expand(f"{original}/devset/{{dataset}}.{{lang}}.gz", dataset=valid_datasets, lang=[src, trg]) + output: multiext(f"{original}/devset", f".{src}.gz", f".{trg}.gz") + params: prefix_output=f"{original}/devset", prefixes=expand(f"{original}/devset/{{dataset}}", dataset=valid_datasets) + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + +rule merge_mono: + message: "Merging clean monolingual datasets" + log: f"{log_dir}/merge_mono_{{lang}}.log" conda: "envs/base.yml" threads: workflow.cores - input: f'{original}/mono.{{lang}}.gz' + group: "clean_mono{lang}" + input: + lambda wildcards: expand(f"{clean}/mono/{{dataset}}.{{lang}}.gz", + dataset=mono_datasets[wildcards.lang], lang=wildcards.lang) output: f"{clean}/mono.{{lang}}.gz" - params: lang='{lang}' - shell: '''bash pipeline/clean/clean-mono.sh "{params.lang}" "{original}/mono" "{clean}/mono" {threads} \ - >> {log} 2>&1''' + params: max_sent=lambda wildcards: mono_max_sent[wildcards.lang] + shell: '''bash pipeline/clean/merge-mono.sh "{output}" {params.max_sent} {input} >> {log} 2>&1''' + # augmentation and teacher 
training @@ -385,11 +368,11 @@ rule train_vocab: bin=rules.marian.output.vocab, corpus_src=clean_corpus_src,corpus_trg=clean_corpus_trg output: f"{models_dir}/vocab/vocab.spm" - params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset" + params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset" shell: 'bash pipeline/train/spm-vocab.sh "{input.corpus_src}" "{input.corpus_trg}" "{output}" >> {log} 2>&1' -if train_s2s: +if train_backward: rule backward: message: "Training backward model" log: f"{log_dir}/train_backward.log" @@ -398,14 +381,14 @@ if train_s2s: resources: gpu=gpus_num group: 'backward' input: - train_src=clean_corpus_src,train_trg=clean_corpus_trg, - val_src=rules.data_val.output.src,val_trg=rules.data_val.output.trg, - bin=rules.marian.output.trainer, vocab=rules.train_vocab.output - output: model=f'{backward_model}/{best_model}' - params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset" - shell: '''bash pipeline/train/train-s2s.sh \ - "{backward_model}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" {trg} {src} \ - {training_args} >> {log} 2>&1''' + rules.merge_devset.output, train_src=clean_corpus_src,train_trg=clean_corpus_trg, + bin=rules.marian.output.trainer, vocab=rules.train_vocab.output, + output: model=f'{backward}/{best_model}' + params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset", + args=training_args.get("backward") or "" + shell: '''bash pipeline/train/train.sh \ + backward train {trg} {src} "{params.prefix_train}" "{params.prefix_test}" "{backward}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_backward: message: "Evaluating backward model" @@ -415,11 +398,13 @@ if train_s2s: resources: gpu=gpus_num group: 'backward' priority: 50 - input: model=f'{backward_model}/{best_model}', datasets=rules.data_test.output + input: + full_eval_datasets, + model=f'{backward}/{best_model}' output: - 
report(directory(f'{backward_model}/eval'),patterns=["{name}.bleu"], + report(directory(eval_backward),patterns=["{name}.metrics"], category='evaluation', subcategory='finetuned', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{backward_model}" "{evaluation}" {trg} {src} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_backward}" "{eval_data}" {trg} {src} {input.model} >> {log} 2>&1' @@ -441,7 +426,7 @@ if augment_corpus: resources: gpu=gpus_num input: rules.marian.output.trainer,file=f'{translated}/mono_trg/file.{{part}}', - vocab=rules.train_vocab.output,model=f'{backward_model}/{best_model}' + vocab=rules.train_vocab.output,model=f'{backward}/{best_model}' output: f'{translated}/mono_trg/file.{{part}}.out' shell: 'bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.model} >> {log} 2>&1' @@ -472,22 +457,43 @@ if augment_corpus: "{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" \ >> {log} 2>&1''' -rule teacher: - message: "Training teacher" - log: f"{log_dir}/train_teacher{{ens}}.log" + + +rule teacher_all: + message: "Training teacher on all data" + log: f"{log_dir}/train_teacher_all{{ens}}.log" conda: "envs/base.yml" threads: gpus_num*2 resources: gpu=gpus_num group: 'teacher{ens}' input: - train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', - val_src=rules.data_val.output.src,val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', bin=rules.marian.output.trainer,vocab=rules.train_vocab.output - output: model=f'{teacher_dir}{{ens}}/{best_model}' - params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_dir}{{ens}}') - shell: '''bash pipeline/train/train-teacher.sh \ - "{params.dir}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - {training_args} >> {log} 2>&1''' + output: 
model=f'{teacher_dir}{{ens}}/{teacher_all_output}' + params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_dir}{{ens}}'), + args=training_args.get("teacher-all") or "" + shell: '''bash pipeline/train/train.sh \ + teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' + +if continue_teacher: + rule teacher_parallel: + message: "Continue training teacher on parallel corpus" + log: f"{log_dir}/train_teacher_parallel{{ens}}.log" + conda: "envs/base.yml" + threads: gpus_num * 2 + resources: gpu=gpus_num + group: 'teacher{ens}' + input: + rules.merge_devset.output, model = f'{teacher_dir}{{ens}}/model.npz', + train_src=clean_corpus_src,train_trg=clean_corpus_trg, + bin=rules.marian.output.trainer,vocab=rules.train_vocab.output + output: model=f'{teacher_dir}{{ens}}/{best_model}' + params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset",dir=directory(f'{teacher_dir}{{ens}}'), + args=training_args.get("teacher-parallel") or "" + shell: '''bash pipeline/train/train.sh \ + teacher continue {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_teacher: message: "Evaluating teacher model" @@ -498,13 +504,29 @@ rule eval_teacher: group: 'teacher{ens}' priority: 50 input: - model=f'{teacher_dir}{{ens}}/{best_model}', - datasets=rules.data_test.output + full_eval_datasets, + model=f'{teacher_dir}{{ens}}/{best_model}' output: - report(directory(f'{teacher_dir}{{ens}}/eval'), patterns=["{name}.bleu"], - category='evaluation', subcategory='teacher', caption='reports/evaluation.rst') - params: dir=f'{teacher_dir}{{ens}}' - shell: 'bash pipeline/train/eval.sh "{params.dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + report(directory(f'{eval_res}/teacher{{ens}}'), patterns=["{name}.metrics"], + category='evaluation', subcategory='teacher{ens}', 
caption='reports/evaluation.rst') + params: dir=f'{eval_res}/teacher{{ens}}' + shell: 'bash pipeline/train/eval.sh "{params.dir}" "{eval_data}" {src} {trg} {input.model} >> {log} 2>&1' + + +if len(ensemble) > 1: + rule eval_teacher_ensemble: + message: "Evaluating an ensemble of teacher models" + log: f"{log_dir}/eval_teacher_ensemble.log" + conda: "envs/base.yml" + threads: gpus_num * 2 + resources: gpu=gpus_num + priority: 50 + input: + full_eval_datasets, models=[f'{teacher_dir}{ens}/{best_model}' for ens in ensemble] + output: + report(directory(eval_teacher_ens),patterns=["{name}.metrics"], + category='evaluation',subcategory='teacher_ensemble',caption='reports/evaluation.rst') + shell: 'bash pipeline/train/eval.sh "{eval_teacher_ens}" "{eval_data}" {src} {trg} {input.models} >> {log} 2>&1' ### translation with teacher @@ -640,7 +662,7 @@ rule ce_filter: output: src_corpus=f"{filtered}/corpus.{src}.gz",trg_corpus=f"{filtered}/corpus.{trg}.gz" params: input_prefix=f'{merged}/corpus',output_prefix=f'{filtered}/corpus' shell: '''bash pipeline/cefilter/ce-filter.sh \ - "{params.input_prefix}" "{params.output_prefix}" "{input.scores}" {threads} >> {log} 2>&1''' + "{params.input_prefix}" "{params.output_prefix}" "{input.scores}" >> {log} 2>&1''' rule alignments: message: 'Training word alignment and lexical shortlists' @@ -664,15 +686,15 @@ rule student: resources: gpu=gpus_num group: 'student' input: - train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, - val_src=rules.data_val.output.src, val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, alignments=rules.alignments.output.alignment, bin=rules.marian.output.trainer, vocab=rules.train_vocab.output output: model=f'{student_dir}/{best_model}' - params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset" + params: 
prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset", + args=training_args.get("student") or "" shell: '''bash pipeline/train/train-student.sh \ - "{student_dir}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - "{input.alignments}" {training_args} >> {log} 2>&1''' + "{input.alignments}" student train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \ + "{student_dir}" "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_student: message: "Evaluating student model" @@ -682,11 +704,11 @@ rule eval_student: resources: gpu=gpus_num group: 'student' priority: 50 - input: model=rules.student.output.model, datasets=rules.data_test.output + input: full_eval_datasets, model=rules.student.output.model output: - report(directory(f'{student_dir}/eval'),patterns=["{name}.bleu"],category='evaluation', + report(directory(eval_student),patterns=["{name}.metrics"],category='evaluation', subcategory='student', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{student_dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_student}" "{eval_data}" {src} {trg} {input.model} >> {log} 2>&1' # quantize @@ -698,15 +720,15 @@ rule finetune_student: resources: gpu=gpus_num group: 'finetune' input: - train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, - val_src=rules.data_val.output.src, val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, alignments=rules.alignments.output.alignment, student_model=rules.student.output.model, bin=rules.marian.output.trainer, vocab=rules.train_vocab.output output: model=f'{student_finetuned_dir}/{best_model}' - params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset" - shell: '''bash pipeline/train/finetune-student.sh \ - "{student_finetuned_dir}" 
"{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - "{input.alignments}" "{input.student_model}" {training_args} >> {log} 2>&1''' + params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset", + args=training_args.get("student-finetune") or "" + shell: '''bash pipeline/train/train-student.sh \ + "{input.alignments}" student finetune {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \ + "{student_finetuned_dir}" "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_finetuned_student: message: "Evaluating fine-tuned student model" @@ -716,19 +738,18 @@ rule eval_finetuned_student: resources: gpu=gpus_num group: 'finetune' priority: 50 - input: model=rules.finetune_student.output.model, datasets=rules.data_test.output + input: full_eval_datasets, model=rules.finetune_student.output.model output: - report(directory(f'{student_finetuned_dir}/eval'),patterns=["{name}.bleu"], + report(directory(eval_student_finetuned),patterns=["{name}.metrics"], category='evaluation', subcategory='finetuned', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{student_finetuned_dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_student_finetuned}" "{eval_data}" {src} {trg} {input.model} \ + >> {log} 2>&1' rule quantize: message: "Quantization" log: f"{log_dir}/quntize.log" conda: "envs/base.yml" - threads: gpus_num*2 - resources: gpu=gpus_num - threads: workflow.cores + threads: 1 input: shortlist=rules.alignments.output.shortlist, model=rules.finetune_student.output.model, bin=rules.marian.output.decoder, vocab=rules.train_vocab.output, devset=f"{original}/devset.{src}.gz" @@ -741,16 +762,16 @@ rule eval_quantized: log: f"{log_dir}/eval_quantized.log" conda: "envs/base.yml" group: 'export' - threads: workflow.cores + threads: 1 priority: 50 input: + full_eval_datasets, model=rules.quantize.output.model, - datasets=rules.data_test.output, 
shortlist=rules.alignments.output.shortlist,vocab=rules.train_vocab.output output: - report(directory(f'{speed}/eval'),patterns=["{name}.bleu"], category='evaluation', + report(directory(eval_speed),patterns=["{name}.metrics"], category='evaluation', subcategory='quantized', caption='reports/evaluation.rst') - shell: '''bash pipeline/quantize/eval.sh "{speed}" "{input.shortlist}" "{evaluation}" "{input.vocab}" \ + shell: '''bash pipeline/quantize/eval.sh "{speed}" "{input.shortlist}" "{eval_data}" "{input.vocab}" "{eval_speed}" \ >> {log} 2>&1''' rule export: diff --git a/configs/config.prod.yml b/configs/config.prod.yml index 89f8fd28f..f298156ac 100644 --- a/configs/config.prod.yml +++ b/configs/config.prod.yml @@ -1,4 +1,12 @@ +#### +# Example of a production config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +# These settings depend on execution environment +# They are set in the Makefile root: "" cuda: "" deps: false @@ -7,7 +15,7 @@ workspace: "" experiment: - name: snakemake + name: prod src: ru trg: en @@ -19,15 +27,26 @@ experiment: mono-max-sentences-src: 100000000 mono-max-sentences-trg: 20000000 - bicleaner-threshold: 0.5 - # split corpus to parallelize translation split-length: 2000000 + best-model: chrf + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + opus_CCAligned/v1: 0.7 + opus_WikiMatrix/v1: 0.7 + opus_OpenSubtitles/v2018: 0.9 + opus_bible-uedin/v1: 0.7 + mtdata_cc_aligned: 0.7 + mtdata_wiki_titles_v1: 0.7 + mtdata_WikiMatrix_v1: 0.7 + mtdata_wiki_titles_v2: 0.7 + mtdata_wmt13_commoncrawl: 0.7 datasets: - # parallel corpus + # parallel training corpus train: - opus_ada83/v1 - opus_UN/v20090831 @@ -74,12 +93,14 @@ datasets: - mtdata_news_commentary_v14 - mtdata_neulab_tedtalksv1_test - mtdata_JW300 + # datasets to merge for validation while training devtest: - flores_dev - mtdata_newstest2019_ruen - 
mtdata_newstest2017_ruen - mtdata_newstest2015_ruen - mtdata_newstest2014_ruen + # datasets for evaluation test: - flores_devtest - sacrebleu_wmt20 @@ -99,7 +120,7 @@ datasets: - news-crawl_news.2013 - news-crawl_news.2012 - news-crawl_news.2011 - # to be translated by the shallow backward model to augment teacher corpus with back-translations + # to be translated by the backward model to augment teacher corpus with back-translations # leave empty to skip augmentation step (high resource languages) mono-trg: - news-crawl_news.2020 diff --git a/configs/config.test.yml b/configs/config.test.yml index 9b2b4d8b2..92a714134 100644 --- a/configs/config.test.yml +++ b/configs/config.test.yml @@ -1,3 +1,7 @@ +#### +# Test config, it rus the pipeline quickly end to end +### + root: "" cuda: "" @@ -6,28 +10,38 @@ gpus: "" workspace: "" experiment: - name: snakemake + name: test src: ru trg: en teacher-ensemble: 2 - # path to a pretrained backward model (optional) backward-model: "" - # limits per downloaded dataset mono-max-sentences-src: 100000 mono-max-sentences-trg: 200000 + split-length: 100000 - bicleaner-threshold: 0.5 + best-model: chrf - split-length: 100000 + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + mtdata_neulab_tedtalksv1_train: 0.6 training: - after-epochs: 1 + backward: + after: 1000u + teacher-all: + after: 2000u + teacher-parallel: + after: 1000u + student: + after: 1000u + student-finetune: + after: 1000u datasets: - # parallel corpus train: - opus_ada83/v1 - opus_GNOME/v1 @@ -39,12 +53,8 @@ datasets: - flores_devtest - sacrebleu_wmt20 - sacrebleu_wmt18 - # monolingual datasets (ex. 
paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020) - # to be translated by the teacher model mono-src: - news-crawl_news.2020 - # to be translated by the shallow backward model to augment teacher corpus with back-translations - # leave empty to skip augmentation step (high resource languages) mono-trg: - news-crawl_news.2020 diff --git a/envs/base.yml b/envs/base.yml index 0734d52d7..c46b7648d 100644 --- a/envs/base.yml +++ b/envs/base.yml @@ -7,6 +7,8 @@ dependencies: - cmake=3.21.1 - pip=21.2.2 - pip: - - sacrebleu==1.5.1 + - sacrebleu==2.0.0 - mtdata==0.2.9 - - fasttext==0.9.2 \ No newline at end of file + - fasttext==0.9.2 + - regex==2019.8.19 + - sacremoses==0.0.43 \ No newline at end of file diff --git a/pipeline/alignment/generate-alignment-and-shortlist.sh b/pipeline/alignment/generate-alignment-and-shortlist.sh index 9ad4342ee..9514014bf 100644 --- a/pipeline/alignment/generate-alignment-and-shortlist.sh +++ b/pipeline/alignment/generate-alignment-and-shortlist.sh @@ -17,6 +17,7 @@ vocab_path=$2 output_dir=$3 threads=$4 +cd "$(dirname "${0}")" mkdir -p "${output_dir}" dir="${output_dir}/tmp" @@ -72,7 +73,7 @@ test -s "${dir}/vocab.txt" || test -s "${output_dir}/lex.s2t.pruned.gz" || pigz -dc "${dir}/lex.s2t.gz" | grep -v NULL | - python3 "pipeline/alignment/prune_shortlist.py" 100 "${dir}/vocab.txt" | + python3 "prune_shortlist.py" 100 "${dir}/vocab.txt" | pigz >"${output_dir}/lex.s2t.pruned.gz" echo "### Deleting tmp dir" diff --git a/pipeline/bicleaner/bicleaner.sh b/pipeline/bicleaner/bicleaner.sh index ef5a0568a..86b3b2eea 100644 --- a/pipeline/bicleaner/bicleaner.sh +++ b/pipeline/bicleaner/bicleaner.sh @@ -15,18 +15,17 @@ corpus_prefix=$1 output_prefix=$2 bicleaner_threshold=$3 type=$4 +threads=$5 +pack_dir=$6 output_dir=$(dirname "${output_prefix}") -tmp_dir="${output_dir}/tmp" -mkdir -p "${tmp_dir}" +mkdir -p "${output_dir}" if [ "${type}" == 'bicleaner-ai' ]; then echo "### Using bicleaner-ai" - bash 
"pipeline/bicleaner/download-pack.sh" "${tmp_dir}" "bicleaner-ai" cmd=bicleaner-ai-classify elif [ "${type}" == 'bicleaner' ]; then echo "### Using bicleaner" - bash "pipeline/bicleaner/download-pack.sh" "${tmp_dir}" "bicleaner" cmd=bicleaner-classify else echo "### Unsupported type: ${type}" @@ -34,17 +33,18 @@ else fi echo "### Classifying and filtering" -test -s "${tmp_dir}/best.gz" || +test -s "${output_prefix}.best.gz" || paste <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | - ${cmd} --scol 1 --tcol 1 - - "${tmp_dir}"/*.yaml | + ${cmd} --scol 1 --tcol 1 --processes "${threads}" - - "${pack_dir}"/*.yaml | awk -v threshold=${bicleaner_threshold} '{if ($3>threshold) {print $0}}' | - pigz >"${tmp_dir}/best.gz" + pigz >"${output_prefix}.best.gz" echo "### Writing output corpus" -pigz -dc "${tmp_dir}/best.gz" | cut -f1 | pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${tmp_dir}/best.gz" | cut -f2 | pigz >"${output_prefix}.${TRG}.gz" +pigz -dc "${output_prefix}.best.gz" | + tee >(cut -f1 | pigz >"${output_prefix}.${SRC}.gz") | + cut -f2 | pigz >"${output_prefix}.${TRG}.gz" echo "### Cleaning files" -rm -rf "${tmp_dir}" +rm "${output_prefix}.best.gz" echo "###### Done: Bicleaner filtering" diff --git a/pipeline/bicleaner/download-pack.sh b/pipeline/bicleaner/download-pack.sh index bbaea6337..afd33d038 100644 --- a/pipeline/bicleaner/download-pack.sh +++ b/pipeline/bicleaner/download-pack.sh @@ -13,6 +13,7 @@ test -v TRG download_path=$1 type=$2 +mkdir -p download_path invalid_url() { wget -S --spider -o - $1 | grep -q '404 Not Found' @@ -47,11 +48,11 @@ else lang2=$TRG fi -if ! 
test -s "${download_path}"/*.yaml; then - wget -P "${download_path}" "${url}/${prefix}${lang1}-${lang2}.${extension}" - tar xvf "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -C "${download_path}" --no-same-owner - mv "${download_path}/${lang1}-${lang2}"/* "${download_path}/" - rm "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -fi + +wget -P "${download_path}" "${url}/${prefix}${lang1}-${lang2}.${extension}" +tar xvf "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -C "${download_path}" --no-same-owner +mv "${download_path}/${lang1}-${lang2}"/* "${download_path}/" +rm "${download_path}/${prefix}${lang1}-${lang2}.${extension}" + echo "### ${type} language pack ${url} is downloaded" diff --git a/pipeline/cefilter/ce-filter.sh b/pipeline/cefilter/ce-filter.sh index 51ded55ba..24b2bc9d4 100644 --- a/pipeline/cefilter/ce-filter.sh +++ b/pipeline/cefilter/ce-filter.sh @@ -13,7 +13,8 @@ test -v TRG corpus_prefix=$1 output_prefix=$2 scores=$3 -threads=$4 + +cd "$(dirname "${0}")" # Part of the data to be removed (0.05 is 5%) remove=0.05 @@ -21,21 +22,10 @@ output_dir=$(dirname "${output_prefix}") tmp="${output_dir}/tmp" mkdir -p "${tmp}" -echo "### Decompressing corpus" -test -s "${tmp}/corpus.${TRG}" || pigz -dc "${corpus_prefix}.${TRG}.gz" >"${tmp}/corpus.${TRG}" -test -s "${tmp}/corpus.${SRC}" || pigz -dc "${corpus_prefix}.${SRC}.gz" >"${tmp}/corpus.${SRC}" - - -echo "### Normalizing scores" -test -s "${tmp}/scores.nrm.txt" || - paste "${scores}" "${tmp}/corpus.${TRG}" | - parallel --no-notice --pipe -k -j "${threads}" --block 50M "python pipeline/cefilter/normalize-scores.py" | - cut -f1 >"${tmp}/scores.nrm.txt" - echo "### Sorting scores" if [ ! 
-s "${tmp}/sorted.gz" ]; then buffer_size="$(echo "$(grep MemTotal /proc/meminfo | awk '{print $2}')"*0.9 | bc | cut -f1 -d.)" - paste "${tmp}/scores.nrm.txt" "${tmp}/corpus.${SRC}" "${tmp}/corpus.${TRG}" | + paste "${scores}" <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" -T "${tmp}" | pigz >"${tmp}/sorted.gz" fi @@ -48,8 +38,9 @@ if [ ! -s "${tmp}/best.gz" ]; then fi echo "### Writing output corpus" -pigz -dc "${tmp}/best.gz" | cut -f1 | pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${tmp}/best.gz" | cut -f2 | pigz >"${output_prefix}.${TRG}.gz" +pigz -dc "${tmp}/best.gz" | + tee >(cut -f1 | pigz >"${output_prefix}.${SRC}.gz") | + cut -f2 | pigz >"${output_prefix}.${TRG}.gz" echo "### Deleting tmp dir" rm -rf "${tmp}" diff --git a/pipeline/cefilter/normalize-scores.py b/pipeline/cefilter/normalize-scores.py deleted file mode 100644 index c920ea662..000000000 --- a/pipeline/cefilter/normalize-scores.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import print_function, unicode_literals, division - -import sys -import argparse -import math - - -def main(): - args = parse_user_args() - - for line in sys.stdin: - fields = line.strip().split("\t") - trg = fields[-1] - score = float(fields[0]) - - if not args.no_normalize: - length = len(trg.split()) - score = score / float(length + 1) - if args.exp: - score = math.exp(score) - - sys.stdout.write("{:.6f}\t{}".format(score, line)) - - -def parse_user_args(): - parser = argparse.ArgumentParser() - parser.add_argument("-n", "--no-normalize", action="store_true") - parser.add_argument("-e", "--exp", action="store_true") - return parser.parse_args() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pipeline/cefilter/score.sh b/pipeline/cefilter/score.sh index 78f72249b..a70ce1f24 100644 --- a/pipeline/cefilter/score.sh +++ b/pipeline/cefilter/score.sh @@ -31,6 
+31,7 @@ mkdir -p "${dir}" --maxi-batch 1000 \ --max-length 250 \ --max-length-crop \ + --normalize \ -d ${GPUS} \ -w "${WORKSPACE}" \ --log "${dir}/scores.txt.log" \ diff --git a/pipeline/clean/clean-corpus.sh b/pipeline/clean/clean-corpus.sh index d2446c1d2..b24926c14 100755 --- a/pipeline/clean/clean-corpus.sh +++ b/pipeline/clean/clean-corpus.sh @@ -8,81 +8,101 @@ set -euo pipefail echo "###### Cleaning corpus" -export PYTHONPATH="${CLEAN_TOOLS}" + test -v SRC test -v TRG -test -v CLEAN_TOOLS -data=$1 -output=$2 +input_prefix=$1 +output_prefix=$2 threads=$3 +dataset=$4 + +cd "$(dirname "${0}")" +export PYTHONPATH="tools" -dir="$(dirname "${output}")" -tmp="${dir}/tmp" -mkdir -p "${tmp}" +dir="$(dirname "${output_prefix}")" +mkdir -p "${dir}" -echo "### CLeaning ${data}" +echo "### Cleaning ${input_prefix}" ###################################################################### echo "### Basic preprocessing" for lng in "${SRC}" "${TRG}"; do - test -s "${output}.${lng}.nrm.gz" || - pigz -dc "${data}.${lng}.gz" | + test -s "${output_prefix}.${lng}.nrm.gz" || + pigz -dc "${input_prefix}.${lng}.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "perl ${CLEAN_TOOLS}/remove-non-printing-char.perl | perl ${CLEAN_TOOLS}/normalize-punctuation.perl -l ${lng}" | - pigz >"${output}.${lng}.nrm.gz" + "perl tools/remove-non-printing-char.perl" | + pigz >"${output_prefix}.${lng}.nrm.gz" +done + +##################################################################### +echo "### Apply monolingual fixes" +for lng in $SRC $TRG; do + if [[ ! 
-x fixes/${dataset}.${lng}.sh ]]; then + test -s "${output_prefix}.${lng}.monofix.gz" || + cp "${output_prefix}.${lng}.nrm.gz" "${output_prefix}.${lng}.monofix.gz" + else + test -s "${output_prefix}.${lng}.monofix.gz" || + pigz -dc "${output_prefix}.${lng}.nrm.gz" \ + | fixes/"${dataset}"."${lng}".sh \ + | pigz >"${output_prefix}.${lng}.monofix.gz" + fi done ###################################################################### -echo "### Deduplication" -test -s "${output}.${SRC}${TRG}.nrm.uniq.gz" || - paste <(pigz -dc "${output}.${SRC}.nrm.gz") <(pigz -dc "${output}.${TRG}.nrm.gz") | - LC_ALL=C sort -S 10G -T "${tmp}" | - uniq | - pigz >"${output}.${SRC}${TRG}.nrm.uniq.gz" +echo "### Apply bilingual fixes" +if [[ -x fixes/${dataset}.sh ]]; then + FIX="fixes/${dataset}.sh ${SRC} ${TRG} ${threads}" +else + FIX="cat" +fi +test -s "${output_prefix}.${SRC}${TRG}.fix.gz" || + paste <(pigz -dc "${output_prefix}.${SRC}.monofix.gz") <(pigz -dc "${output_prefix}.${TRG}.monofix.gz") \ + | $FIX \ + | pigz > "${output_prefix}.${SRC}${TRG}.fix.gz" ###################################################################### echo "### Rule-based filtering" -test -s "${output}.${SRC}${TRG}.rule-based.gz" || - pigz -dc "${output}.${SRC}${TRG}.nrm.uniq.gz" | +test -s "${output_prefix}.${SRC}${TRG}.rule-based.gz" || + pigz -dc "${output_prefix}.${SRC}${TRG}.fix.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "python3 ${CLEAN_TOOLS}/clean_parallel.py -l1 ${SRC} -l2 ${TRG} --debug" \ - 2>"${output}.${SRC}${TRG}.clean.debug.txt" | - pigz >"${output}.${SRC}${TRG}.rule-based.gz" + "python3 tools/clean_parallel.py -l1 ${SRC} -l2 ${TRG} --debug" \ + 2>"${output_prefix}.${SRC}${TRG}.clean.debug.txt" | + pigz >"${output_prefix}.${SRC}${TRG}.rule-based.gz" ###################################################################### echo "### Language identification" -test -s "${output}.${SRC}${TRG}.langid.gz" || - pigz -dc "${output}.${SRC}${TRG}.rule-based.gz" | +test -s 
"${output_prefix}.${SRC}${TRG}.langid.gz" || + pigz -dc "${output_prefix}.${SRC}${TRG}.rule-based.gz" | # memory intensive parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M \ - "python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1 | python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1" | + "python3 -Wi tools/langid_fasttext.py -f 1 | python3 -Wi tools/langid_fasttext.py -f 1" | grep -P "^${SRC}\t${TRG}\t" | cut -f3,4 | - pigz >"${output}.${SRC}${TRG}.langid.gz" + pigz >"${output_prefix}.${SRC}${TRG}.langid.gz" ###################################################################### echo "### Removing leading and repetitive white spaces" -pigz -dc "${output}.${SRC}${TRG}.langid.gz" | +pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | cut -f1 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output}.${SRC}.gz" +pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${output}.${SRC}${TRG}.langid.gz" | +pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | cut -f2 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output}.${TRG}.gz" +pigz >"${output_prefix}.${TRG}.gz" -test -s "${output}.${SRC}.gz" || exit 1 -test -s "${output}.${TRG}.gz" || exit 1 +test -s "${output_prefix}.${SRC}.gz" || exit 1 +test -s "${output_prefix}.${TRG}.gz" || exit 1 -echo "### Remove ${data} from intermediate steps" -rm -f "${output}".*.nrm.gz "${output}".*.nrm.uniq.gz "${output}".*.langid.gz "${output}".*.rule-based.gz -rm -rf "${tmp}" +echo "### Remove input_prefix from intermediate steps" +rm -rf "${output_prefix}".*.nrm.gz "${output_prefix}".*.langid.gz \ + "${output_prefix}".*.rule-based.gz "${output_prefix}".*.*fix.gz -echo "### Clean data is written to ${output}" +echo "### Clean ${input_prefix} is written to ${output_prefix}" echo "###### Done: Cleaning corpus" diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh index 1490277a2..2c2fe7bbe 100755 --- a/pipeline/clean/clean-mono.sh +++ b/pipeline/clean/clean-mono.sh @@ -9,57 +9,63 @@ set -euo 
pipefail echo "###### Cleaning monolingual data" lang=$1 -input=$2 -output=$3 +input_prefix=$2 +output_prefix=$3 threads=$4 +dataset=$5 -test -v CLEAN_TOOLS +echo "### Cleaning ${input_prefix}" -echo "### CLeaning ${input}" +cd "$(dirname "${0}")" +export PYTHONPATH="tools" -dir="$(dirname "${output}")" -tmp="${dir}/tmp" -mkdir -p "${tmp}" +dir="$(dirname "${output_prefix}")" +mkdir -p "${dir}" ###################################################################### echo "### Basic preprocessing" -test -s "${output}.${lang}.nrm.gz" || - pigz -dc "${input}.${lang}.gz" | +test -s "${output_prefix}.${lang}.nrm.gz" || + pigz -dc "${input_prefix}.${lang}.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "perl ${CLEAN_TOOLS}/remove-non-printing-char.perl | perl ${CLEAN_TOOLS}/normalize-punctuation.perl -l ${lang}" | - pigz >"${output}.${lang}.nrm.gz" + "perl tools/remove-non-printing-char.perl" | + pigz >"${output_prefix}.${lang}.nrm.gz" -###################################################################### -echo "### Deduplication" -test -s "${output}.${lang}.nrm.uniq.gz" || - pigz -dc "${output}.${lang}.nrm.gz" | - LC_ALL=C sort -S 10G -T "${tmp}" | - uniq | - pigz >"${output}.${lang}.nrm.uniq.gz" +##################################################################### +echo "### Apply monolingual fixes" +if [[ ! 
-x fixes/${dataset}.${lang}.sh ]]; then + test -s "${output_prefix}.${lang}.monofix.gz" || + cp "${output_prefix}.${lang}.nrm.gz" "${output_prefix}.${lang}.monofix.gz" +else + test -s "${output_prefix}.${lang}.monofix.gz" || + pigz -dc "${output_prefix}.${lang}.nrm.gz" \ + | fixes/"${dataset}"."${lang}".sh \ + | pigz >"${output_prefix}.${lang}.monofix.gz" +fi ###################################################################### echo "### Language identification" -test -s "${output}.${lang}.langid.gz" || - pigz -dc "${output}.${lang}.nrm.uniq.gz" | +test -s "${output_prefix}.${lang}.langid.gz" || + pigz -dc "${output_prefix}.${lang}.monofix.gz" | # memory intensive - parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "python ${CLEAN_TOOLS}/langid_fasttext.py" | + parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "python tools/langid_fasttext.py" | grep -P "^${lang}\t" | cut -f2 | - pigz >"${output}.${lang}.langid.gz" + pigz >"${output_prefix}.${lang}.langid.gz" ###################################################################### echo "### Rule-based filtering" -pigz -dc "${output}.${lang}.langid.gz" | +pigz -dc "${output_prefix}.${lang}.langid.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "python ${CLEAN_TOOLS}/clean_mono.py -l ${lang} --debug" \ - 2>"${output}.${lang}.clean.debug.txt" | -pigz >"${output}.${lang}.gz" + "python tools/clean_mono.py -l ${lang} --debug" \ + 2>"${output_prefix}.${lang}.clean.debug.txt" | +pigz >"${output_prefix}.${lang}.gz" -test -s "${output}.${lang}.gz" || exit 1 +test -s "${output_prefix}.${lang}.gz" || exit 1 echo "### Remove data from intermediate steps" -rm -rf "${output}".*.nrm.gz "${output}".*.nrm.uniq.gz "${output}".*.langid.gz "${tmp}" +rm -rf "${output_prefix}".*.nrm.gz "${output_prefix}".*.langid.gz \ + "${output_prefix}".*.monofix.gz -echo "### Clean data is written to ${output}" +echo "### Clean data is written to ${output_prefix}" echo "###### 
Done: Cleaning monolingual data" diff --git a/pipeline/clean/fixes/detok.sh b/pipeline/clean/fixes/detok.sh new file mode 100755 index 000000000..0f6d792b5 --- /dev/null +++ b/pipeline/clean/fixes/detok.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Detokenize + +SRC=$1 +TRG=$2 +threads=$3 + +temp=$(mktemp -d) + +tee >(cut -f1 | sacremoses -j $threads -l $SRC detokenize >$temp/$SRC.detok) \ + | cut -f2 | sacremoses -j $threads -l $TRG detokenize >$temp/$TRG.detok + +paste $temp/$SRC.detok $temp/$TRG.detok + +rm -r $temp diff --git a/pipeline/clean/fixes/mtdata_JW300.mt.sh b/pipeline/clean/fixes/mtdata_JW300.mt.sh new file mode 100755 index 000000000..98e5786f1 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_JW300.mt.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Fix Maltese tokenization in JW300 that detokenizer cannot fix +sed "s/ - $(echo -ne \u200b) /-/g" \ + | sed 's/ - /-/g' diff --git a/pipeline/clean/fixes/mtdata_JW300.sh b/pipeline/clean/fixes/mtdata_JW300.sh new file mode 100755 index 000000000..8f24e4439 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_JW300.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize JW300 +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh new file mode 100755 index 000000000..79f003315 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Detokenize Catalan apostrophe, dates and laws, and ending period +# detokenize middle dot +sed "s/\([lndsLNDS]\) ' \([a-zA-Z1]\)/\1'\2/g" \ + | sed "s#\([0-9]\) \?/ \?\([0-9]\)#\1/\2#g" \ + | sed "s/\([a-z]\) .\$/\1./g" \ + | sed "s/l · l/l·l/g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh new file mode 100755 index 000000000..88b6cec22 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Detokenize dates and laws, and ending period +sed "s#\([0-9]\) \?/ 
\?\([0-9]\)#\1/\2#g" \ + | sed "s/\([a-z]\) .\$/\1./g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh new file mode 100644 index 000000000..9c258aea7 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize DOGC +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh b/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh new file mode 100755 index 000000000..8f24e4439 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize JW300 +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh b/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh new file mode 100755 index 000000000..695c69afb --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Detokenize SETIMES + +SRC=$1 +TRG=$2 +threads=$3 + +temp=$(mktemp -d) + +tee >(cut -f1 | sacremoses -j $threads -l $SRC detokenize >$temp/$SRC.detok) \ + >(cut -f2 | sacremoses -j $threads -l $TRG detokenize >$temp/$TRG.detok) + +paste $temp/$SRC.detok $temp/$TRG.detok + +rm -r $temp diff --git a/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh new file mode 100755 index 000000000..33c194d8c --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize English possessive +sed "s/\([a-z]\) ' \([s]\)/\1'\2/g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh new file mode 100755 index 000000000..0e81ef40f --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize French apostrophe +sed "s/\([lndsLNDS]\) ' \([a-zA-Z]\)/\1'\2/g" diff --git a/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh 
b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh new file mode 100755 index 000000000..993534550 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize Romanian hyphens +sed -E "s/(\w) - (\w)/\1-\2/g" diff --git a/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh new file mode 100755 index 000000000..8c42faa65 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize neulabs +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/merge-corpus.sh b/pipeline/clean/merge-corpus.sh new file mode 100644 index 000000000..734dbd2ea --- /dev/null +++ b/pipeline/clean/merge-corpus.sh @@ -0,0 +1,35 @@ +#!/bin/bash +## +# Merges and deduplicates parallel datasets +# + +set -x +set -euo pipefail + +echo "###### Merging parallel datasets" + +test -v SRC +test -v TRG + +output_prefix=$1 +input_prefixes=( "${@:2}" ) + +tmp="${output_prefix}/merge" +mkdir -p "${tmp}" + +echo "### Merging" +cat "${input_prefixes[@]/%/.${SRC}.gz}" >"${tmp}/corpus.${SRC}.dup.gz" +cat "${input_prefixes[@]/%/.${TRG}.gz}" >"${tmp}/corpus.${TRG}.dup.gz" + +echo "### Deduplication" +paste <(pigz -dc "${tmp}/corpus.${SRC}.dup.gz") <(pigz -dc "${tmp}/corpus.${TRG}.dup.gz") | +LC_ALL=C sort -S 10G -T "${tmp}" | +uniq | +pigz >"${tmp}.${SRC}${TRG}.gz" + +pigz -dc "${tmp}.${SRC}${TRG}.gz" | cut -f1 | pigz > "${output_prefix}.${SRC}.gz" +pigz -dc "${tmp}.${SRC}${TRG}.gz" | cut -f2 | pigz > "${output_prefix}.${TRG}.gz" + +rm -rf "${tmp}" + +echo "###### Done: Merging parallel datasets" diff --git a/pipeline/clean/merge-mono.sh b/pipeline/clean/merge-mono.sh new file mode 100644 index 000000000..564bd084b --- /dev/null +++ b/pipeline/clean/merge-mono.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## +# Merges monolingual datasets +# + +set -x +set -euo pipefail + +echo "###### Merging monolingual datasets" + +output=$1 
+max_sent=$2 +datasets=( "${@:3}" ) + +dir=$(dirname "${output}") +mkdir -p "${dir}" + +pigz -dc "${datasets[@]}" | + shuf -n "${max_sent}" | + pigz >"${output}" + + +echo "###### Done: Merging monolingual datasets" diff --git a/pipeline/clean/tools/clean_parallel.py b/pipeline/clean/tools/clean_parallel.py index 115004b03..88fc17254 100755 --- a/pipeline/clean/tools/clean_parallel.py +++ b/pipeline/clean/tools/clean_parallel.py @@ -17,15 +17,38 @@ RATIO_ALPHA_CHARS = 0.5 # minimum fraction of alpha characters in a source sentence CHARS = { + 'bg': r'[АаБбВвГгДддЕеЖжЗзИиЙйКкkasЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя]', 'cs': r'[a-zÁáČčĎďÉéěÍíŇňÓóŘřŠšŤťÚúůÝýŽž]', + 'ca': r'[a-zÀàÈèÉéÍíÒòÓóÚúÇç]', + 'da': r'[a-zÆæØøÅå]', + 'de': r'[a-zÄäÖöÜüß]', 'en': r'[a-z]', - 'es': r'[a-zÁáÉéÍíÓóÚúñÑ¡!¿?]', + 'el': r'[a-zΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω]', + 'es': r'[a-zÁáÉéÍíÓóÚúñÑ]', 'et': r'[a-zÕõÄäÖöÜü]', - 'de': r'[a-zÄäÖöÜüß]', - 'no': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', + 'eu': r'[a-zñÑ]', + 'fi': r'[a-zÅåÄäÖö]', + 'fr': r'[a-zÂâÁáÀàâÇçÉéÈèÊêÓóÒòÔôŒœÜüÛûŸÿ]', + 'ga': r'[abcdefghilmnoprstuáéíóúÁÉÍÓÚ]', + 'gl': r'[a-zÁáÉéÍíÓóÚúÑñ]', + 'hr': r'[abcčČćĆdđĐefghijklmnoprsšŠtuvzžŽ]', + 'hu': r'[a-zÁáÉéÍíÓóÖöŐőŰű]', + 'is': r'[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]', + 'it': r'[a-zàÀèÈéÉìÌíÍîÎòÒóÓùÙúÚ]', + 'lt': r'[aąbcČčdeĘęĖėfghiĮįyjklmnoprsŠštuŲųŪūvzŽž]', + 'lv': r'[aĀābcČčdeĒēfgĢģhiĪījkĶķlĻļmnŅņoprsŠštuŪūvzŽž]', + 'mt': r'[abĊċdefĠġghĦħiiejklmnopqrstuvwxŻżz]', 'nb': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', + 'nl': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÚú]', + 'no': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', 'nn': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', - 'is': r'[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]', + 'pl': r'[a-zĄąĆćĘꣳŃńÓ󌜏źŻż]', + 'pt': r'[a-zÂâÁáÀàÃãÇçÉéÈèÊêÍíÌìÓóÒòÔôÕõÚúÙù]', + 'ro': r'[a-zĂăÂâÎîȘșȚț]', + 'ru': r'[а-я]', + 'sk': r'[a-záäÁÄčČďĎžéÉíÍĺĹľĽňŇóÓôÔŕŔšŠťŤúÚýÝžŽ]', + 'sl': r'[abcčČdđĐefghijklmnoprsšŠtuvzžŽ]', + 'sv': r'[a-zÅåÄäÖö]', } diff --git 
a/pipeline/data/download-corpus.sh b/pipeline/data/download-corpus.sh index 167f0433b..7937a228a 100644 --- a/pipeline/data/download-corpus.sh +++ b/pipeline/data/download-corpus.sh @@ -1,40 +1,27 @@ #!/bin/bash ## -# Downloads parallel corpus datasets +# Downloads parallel dataset # set -x set -euo pipefail -echo "###### Downloading corpus" - test -v SRC test -v TRG -prefix=$1 -cache=$2 -id=$3 -datasets=( "${@:4}" ) - -src_corpus="${prefix}.${SRC}.gz" -trg_corpus="${prefix}.${TRG}.gz" -dir=$(dirname "${prefix}")/${id} -mkdir -p "${dir}" +dataset=$1 +output_prefix=$2 -echo "### Downloading datasets" +echo "###### Downloading dataset ${dataset}" -for dataset in "${datasets[@]}"; do - echo "### Downloading dataset ${dataset}" - name=${dataset#*_} - type=${dataset%%_*} - bash "pipeline/data/importers/corpus/${type}.sh" "${SRC}" "${TRG}" "${dir}" "${name}" -done - -cat "${dir}"/*."${SRC}" | pigz >"${src_corpus}" -cat "${dir}"/*."${TRG}" | pigz >"${trg_corpus}" +cd "$(dirname "${0}")" +dir=$(dirname "${output_prefix}") +mkdir -p "${dir}" -rm -rf "${dir}" +name=${dataset#*_} +type=${dataset%%_*} +bash "importers/corpus/${type}.sh" "${SRC}" "${TRG}" "${output_prefix}" "${name}" -echo "###### Done: Downloading corpus" +echo "###### Done: Downloading dataset ${dataset}" diff --git a/pipeline/data/download-eval.sh b/pipeline/data/download-eval.sh deleted file mode 100644 index 871d5c6f3..000000000 --- a/pipeline/data/download-eval.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -## -# Downloads evaluation datasets -# - -set -x -set -euo pipefail - -echo "###### Downloading evaluation datasets" - -test -v SRC -test -v TRG - -dir=$1 -cache=$2 -datasets=( "${@:3}" ) - -for dataset in "${datasets[@]}"; do - name="${dataset//[^A-Za-z0-9_- ]/_}" - bash "pipeline/data/download-corpus.sh" "${dir}/${name}" "${cache}" eval "${dataset}" - - test -e "${dir}/${name}.${SRC}" || pigz -dk "${dir}/${name}.${SRC}.gz" - test -e "${dir}/${name}.${TRG}" || pigz -dk "${dir}/${name}.${TRG}.gz" 
-done - - -echo "###### Done: Downloading evaluation datasets" diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh index 70300ff3c..e4875bf6c 100644 --- a/pipeline/data/download-mono.sh +++ b/pipeline/data/download-mono.sh @@ -6,53 +6,37 @@ set -x set -euo pipefail -echo "###### Downloading monolingual data" - -lang=$1 -max_sent=$2 -prefix=$3 -cache=$4 -datasets=( "${@:5}" ) - -file_name="${prefix}.${lang}.gz" -dir=$(dirname "${prefix}")/mono - -if [ ! -e "${file_name}" ]; then - echo "### Downloading monolingual corpus for ${lang}" - mkdir -p "${dir}" - coef=0.1 - - for dataset in "${datasets[@]}"; do - echo "### Downloading dataset ${dataset}" - source_prefix="${dir}/${dataset}.original.${lang}" - gz_path="${dir}/${dataset}.${lang}.gz" - name=${dataset#*_} - type=${dataset%%_*} - - test -s "${source_prefix}.gz" || - bash "pipeline/data/importers/mono/${type}.sh" "${lang}" "${source_prefix}" "${name}" - - echo "### Sampling dataset ${dataset}" - # temporary disable pipefail because perl operation causes SIGPIPE (141) - set +o pipefail - test -s "${gz_path}" || - pigz -dc "${source_prefix}.gz" | - shuf -n "$(bc -l <<<"${max_sent}+${max_sent}*${coef}")" | - perl -ne 'print if(split(/\s/, $_) < 100)' | - head -n "${max_sent}" | - pigz >"${gz_path}" - set -o pipefail - - rm "${source_prefix}"* - done - - pigz -dc "${dir}"/*."${lang}".gz | shuf -n "${max_sent}" | pigz >"${file_name}" - -fi - -test -s "${file_name}" - -lines=$(pigz -dc "${file_name}" | wc -l) -echo "### Number of sentences: ${lines}" +dataset=$1 +lang=$2 +max_sent=$3 +output_path=$4 +coef=0.1 + +echo "###### Downloading monolingual data for language ${lang} dataset ${dataset}" + +cd "$(dirname "${0}")" + +tmp=$(dirname "${output_path}")/original +mkdir -p "${tmp}" + +echo "### Downloading dataset" +original_prefix="${tmp}/${dataset}.original.${lang}" +name=${dataset#*_} +type=${dataset%%_*} + +test -s "${original_prefix}.gz" || + bash "importers/mono/${type}.sh" "${lang}" 
"${original_prefix}" "${name}" + +echo "### Sampling dataset" +# temporary disable pipefail because perl operation causes SIGPIPE (141) +set +o pipefail +pigz -dc "${original_prefix}.gz" | +shuf -n "$(bc -l <<<"${max_sent}+${max_sent}*${coef}")" | +perl -ne 'print if(split(/\s/, $_) < 100)' | +head -n "${max_sent}" | +pigz >"${output_path}" +set -o pipefail + +rm -rf "${original_prefix}.gz" echo "###### Done: Downloading monolingual data" diff --git a/pipeline/data/importers/corpus/custom-corpus.sh b/pipeline/data/importers/corpus/custom-corpus.sh index cde465b69..35a6a28e1 100644 --- a/pipeline/data/importers/corpus/custom-corpus.sh +++ b/pipeline/data/importers/corpus/custom-corpus.sh @@ -11,11 +11,11 @@ echo "###### Copying custom corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -cp "${dataset}.${src}.gz" "${dir}/" -cp "${dataset}.${trg}.gz" "${dir}/" +cp "${dataset}.${src}.gz" "${output_prefix}.${src}.gz" +cp "${dataset}.${trg}.gz" "${output_prefix}.${trg}.gz" echo "###### Done: Copying custom corpus" \ No newline at end of file diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh index 19e799561..e66e61ed6 100644 --- a/pipeline/data/importers/corpus/flores.sh +++ b/pipeline/data/importers/corpus/flores.sh @@ -11,15 +11,13 @@ echo "###### Downloading flores corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -tmp="${dir}/flores" +tmp="$(dirname "${output_prefix}")/flores/${dataset}" mkdir -p "${tmp}" -test -s "${tmp}/flores101_dataset.tar.gz" || - wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" - +wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner flores_code() { @@ -39,8 +37,8 @@ flores_code() { src_flores=$(flores_code "${src}") trg_flores=$(flores_code "${trg}") -cp 
"${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" "${dir}/flores.${src}" -cp "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" "${dir}/flores.${trg}" +pigz -c "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" > "${output_prefix}.${src}.gz" +pigz -c "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" > "${output_prefix}.${trg}.gz" rm -rf "${tmp}" diff --git a/pipeline/data/importers/corpus/mtdata.sh b/pipeline/data/importers/corpus/mtdata.sh index 821f8b446..e243f14b9 100644 --- a/pipeline/data/importers/corpus/mtdata.sh +++ b/pipeline/data/importers/corpus/mtdata.sh @@ -10,24 +10,20 @@ echo "###### Downloading mtdata corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 +tmp="$(dirname "${output_prefix}")/mtdata/${dataset}" +mkdir -p "${tmp}" + src_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${src}', fail_error=True))") trg_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${trg}', fail_error=True))") -if [ ! 
-e "${dir}/${dataset}.${trg}" ]; then - mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${dir}" - - for f in "${dir}"/train-parts/*."${src_iso}"; do - mv "${f}" "${dir}/${dataset}.${src}" - done - for f in "${dir}"/train-parts/*."${trg_iso}"; do - mv "${f}" "${dir}/${dataset}.${trg}" - done +mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${tmp}" - rm -rf "${dir}/train-parts" -fi +pigz -c "${tmp}/train-parts/${dataset}-${src_iso}_${trg_iso}.${src_iso}" > "${output_prefix}.${src}.gz" +pigz -c "${tmp}/train-parts/${dataset}-${src_iso}_${trg_iso}.${trg_iso}" > "${output_prefix}.${trg}.gz" +rm -rf "${tmp}" echo "###### Done: Downloading mtdata corpus" diff --git a/pipeline/data/importers/corpus/opus.sh b/pipeline/data/importers/corpus/opus.sh index 3bff6eced..58172a199 100644 --- a/pipeline/data/importers/corpus/opus.sh +++ b/pipeline/data/importers/corpus/opus.sh @@ -10,23 +10,27 @@ echo "###### Downloading opus corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 name=${dataset%%/*} +name_and_version="${dataset//[^A-Za-z0-9_- ]/_}" -if [ ! -s "${dir}/${name}.${src}-${trg}.${trg}" ] && [ ! 
-s "${dir}/${name}.${trg}-${src}.${trg}" ]; then - mkdir -p "${dir}/opus" +tmp="$(dirname "${output_prefix}")/opus/${name_and_version}" +mkdir -p "${tmp}" - name_and_version="${dataset//[^A-Za-z0-9_- ]/_}" - archive_path="${dir}/opus/${name_and_version}.txt.zip" +archive_path="${tmp}/${name}.txt.zip" - test -s "${archive_path}" || - wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" || - wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip" - unzip -o "${archive_path}" -d "${dir}" +wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" || + wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip" +unzip -o "${archive_path}" -d "${tmp}" + +for lang in ${src} ${trg}; do + pigz -c "${tmp}/${name}.${src}-${trg}.${lang}" > "${output_prefix}.${lang}.gz" || + pigz -c "${tmp}/${name}.${trg}-${src}.${lang}" > "${output_prefix}.${lang}.gz" +done + +rm -rf "${tmp}" - rm -rf "${dir}/opus" -fi echo "###### Done: Downloading opus corpus" diff --git a/pipeline/data/importers/corpus/sacrebleu.sh b/pipeline/data/importers/corpus/sacrebleu.sh index dece83e21..cecacc3bf 100644 --- a/pipeline/data/importers/corpus/sacrebleu.sh +++ b/pipeline/data/importers/corpus/sacrebleu.sh @@ -10,15 +10,10 @@ echo "###### Downloading sacrebleu corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -name="${dataset//[^A-Za-z0-9_- ]/_}" - -test -s "${dir}/${name}.${src}" || -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src > "${dir}/${name}.${src}" - -test -s "${dir}/${name}.${trg}" || -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref > "${dir}/${name}.${trg}" +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src | pigz > "${output_prefix}.${src}.gz" +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref | pigz > "${output_prefix}.${trg}.gz" echo "###### Done: Downloading sacrebleu corpus" diff 
--git a/pipeline/data/importers/mono/commoncrawl.sh b/pipeline/data/importers/mono/commoncrawl.sh index d093ebe20..917971f2c 100644 --- a/pipeline/data/importers/mono/commoncrawl.sh +++ b/pipeline/data/importers/mono/commoncrawl.sh @@ -12,8 +12,7 @@ lang=$1 output_prefix=$2 dataset=$3 -test -s "${output_prefix}.gz" || - wget -O "${output_prefix}.xz" \ +wget -O "${output_prefix}.xz" \ "http://web-language-models.s3-website-us-east-1.amazonaws.com/${dataset}/deduped/${lang}.xz" xzcat "${output_prefix}.xz" | pigz >"${output_prefix}.gz" diff --git a/pipeline/data/importers/mono/custom-mono.sh b/pipeline/data/importers/mono/custom-mono.sh index c326cde10..aba3e053c 100644 --- a/pipeline/data/importers/mono/custom-mono.sh +++ b/pipeline/data/importers/mono/custom-mono.sh @@ -13,7 +13,7 @@ lang=$1 output_prefix=$2 dataset=$3 -cp "${dataset}.${lang}.gz" "${output_prefix}.${lang}.gz" +cp "${dataset}.${lang}.gz" "${output_prefix}.gz" echo "###### Done: Copying custom monolingual dataset" \ No newline at end of file diff --git a/pipeline/data/importers/mono/news-crawl.sh b/pipeline/data/importers/mono/news-crawl.sh index d695243ec..39d8dc13b 100644 --- a/pipeline/data/importers/mono/news-crawl.sh +++ b/pipeline/data/importers/mono/news-crawl.sh @@ -12,8 +12,7 @@ dataset=$3 echo "###### Downloading WMT newscrawl monolingual data" -test -s "${output_prefix}.gz" || - wget -O "${output_prefix}.gz" \ +wget -O "${output_prefix}.gz" \ "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" echo "###### Done: Downloading WMT newscrawl monolingual data" diff --git a/pipeline/data/importers/mono/paracrawl-mono.sh b/pipeline/data/importers/mono/paracrawl-mono.sh index d86a1b852..fca5660ab 100644 --- a/pipeline/data/importers/mono/paracrawl-mono.sh +++ b/pipeline/data/importers/mono/paracrawl-mono.sh @@ -13,7 +13,6 @@ output_prefix=$2 dataset=$3 if [[ "${lang}" == "en" ]]; then - test -s "${output_prefix}.gz" || wget -O "${output_prefix}.gz" 
"https://neural.mt/data/${dataset}-mono/en-000.gz" else echo "Only English language is supported at this time for Paracrawl" diff --git a/pipeline/quantize/eval.sh b/pipeline/quantize/eval.sh index 602a7f7ad..07caf925e 100644 --- a/pipeline/quantize/eval.sh +++ b/pipeline/quantize/eval.sh @@ -16,32 +16,36 @@ model_dir=$1 shortlist=$2 datasets_dir=$3 vocab=$4 +eval_dir=$5 -eval_dir="${model_dir}/eval" +cd "$(dirname "${0}")" mkdir -p "${eval_dir}" echo "### Evaluating a model ${model_dir} on CPU" -for src_path in "${datasets_dir}"/*."${SRC}"; do - prefix=$(basename "${src_path}" ".${SRC}") +for src_path in "${datasets_dir}"/*."${SRC}.gz"; do + prefix=$(basename "${src_path}" ".${SRC}.gz") echo "### Evaluating ${prefix} ${SRC}-${TRG}" + pigz -dc "${datasets_dir}/${prefix}.${TRG}.gz" > "${eval_dir}/${prefix}.${TRG}.ref" + test -s "${eval_dir}/${prefix}.${TRG}.bleu" || - tee "${eval_dir}/${prefix}.${SRC}" < "${src_path}" | + pigz -dc "${src_path}" | + tee "${eval_dir}/${prefix}.${SRC}" | "${MARIAN}"/marian-decoder \ -m "${model_dir}/model.intgemm.alphas.bin" \ -v "${vocab}" "${vocab}" \ - -c "pipeline/quantize/decoder.yml" \ + -c "decoder.yml" \ --quiet \ --quiet-translation \ --log "${eval_dir}/${prefix}.log" \ --shortlist "${shortlist}" false \ --int8shiftAlphaAll | tee "${eval_dir}/${prefix}.${TRG}" | - sacrebleu -d --score-only -l "${SRC}-${TRG}" "${datasets_dir}/${prefix}.${TRG}" | - tee "${eval_dir}/${prefix}.${TRG}.bleu" + sacrebleu "${eval_dir}/${prefix}.${TRG}.ref" -d -f text --score-only -l "${SRC}-${TRG}" -m bleu chrf | + tee "${eval_dir}/${prefix}.${TRG}.metrics" - test -e "${eval_dir}/${prefix}.${TRG}.bleu" || exit 1 + test -e "${eval_dir}/${prefix}.${TRG}.metrics" || exit 1 done echo "###### Done: Evaluation of a quantized model" diff --git a/pipeline/quantize/quantize.sh b/pipeline/quantize/quantize.sh index 3d3def6d2..9a18653a4 100644 --- a/pipeline/quantize/quantize.sh +++ b/pipeline/quantize/quantize.sh @@ -19,6 +19,8 @@ shortlist=$3 devtest_src=$4 
output_dir=$5 +cd "$(dirname "${0}")" + res_model="${output_dir}/model.intgemm.alphas.bin" mkdir -p "${output_dir}" cp "${vocab}" "${output_dir}" @@ -28,7 +30,7 @@ test -s "${output_dir}/quantmults" || "${MARIAN}"/marian-decoder \ -m "${model}" \ -v "${vocab}" "${vocab}" \ - -c "pipeline/quantize/decoder.yml" \ + -c "decoder.yml" \ -i "${devtest_src}" \ -o "${output_dir}/output.${TRG}" \ --shortlist "${shortlist}" false \ diff --git a/pipeline/train/configs/model/s2s.yml b/pipeline/train/configs/model/backward.yml similarity index 100% rename from pipeline/train/configs/model/s2s.yml rename to pipeline/train/configs/model/backward.yml diff --git a/pipeline/train/configs/model/student.tiny11.yml b/pipeline/train/configs/model/student.yml similarity index 100% rename from pipeline/train/configs/model/student.tiny11.yml rename to pipeline/train/configs/model/student.yml diff --git a/pipeline/train/configs/model/teacher.transformer.yml b/pipeline/train/configs/model/teacher.transformer.yml deleted file mode 100644 index cf3a72d3b..000000000 --- a/pipeline/train/configs/model/teacher.transformer.yml +++ /dev/null @@ -1,8 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/transformer -# https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -dec-depth: 6 -dim-vocabs: [32000, 32000] -enc-depth: 6 -tied-embeddings-all: true -transformer-dropout: 0.1 -type: transformer diff --git a/pipeline/train/configs/model/teacher.yml b/pipeline/train/configs/model/teacher.yml new file mode 100644 index 000000000..57ebc0510 --- /dev/null +++ b/pipeline/train/configs/model/teacher.yml @@ -0,0 +1,6 @@ +# https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +dim-vocabs: [32000, 32000] +type: transformer +# tasks: https://github.com/marian-nmt/marian-dev/blob/master/src/common/aliases.cpp +task: transformer-big +#task: transformer-base # use smaller model for low resource (<5M sentences) \ No newline at end of file diff --git 
a/pipeline/train/configs/training/s2s.train.yml b/pipeline/train/configs/training/backward.train.yml similarity index 84% rename from pipeline/train/configs/training/s2s.train.yml rename to pipeline/train/configs/training/backward.train.yml index 80a74e7ec..8e66f1d46 100644 --- a/pipeline/train/configs/training/s2s.train.yml +++ b/pipeline/train/configs/training/backward.train.yml @@ -1,5 +1,5 @@ ## https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -after-epochs: 10 +after: 10e # change based on available training data beam-size: 12 cost-type: ce-mean-words disp-freq: 1000 diff --git a/pipeline/train/configs/training/teacher.continue.yml b/pipeline/train/configs/training/teacher.continue.yml new file mode 100644 index 000000000..c752ca9d6 --- /dev/null +++ b/pipeline/train/configs/training/teacher.continue.yml @@ -0,0 +1,9 @@ +# https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +disp-freq: 1000 +learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001 +no-restore-corpus: True +optimizer-delay: 1 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer +save-freq: 5000 +valid-freq: 3000 +valid-max-length: 300 +valid-mini-batch: 8 \ No newline at end of file diff --git a/pipeline/train/configs/training/teacher.train.yml b/pipeline/train/configs/training/teacher.train.yml new file mode 100644 index 000000000..718b476b1 --- /dev/null +++ b/pipeline/train/configs/training/teacher.train.yml @@ -0,0 +1,9 @@ +# https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +after: 2e # remove for low resource languages or if training without augmentation +disp-freq: 1000 +learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001 +optimizer-delay: 1 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer +save-freq: 5000 +valid-freq: 3000 +valid-max-length: 300 +valid-mini-batch: 8 \ No newline at end of file diff --git 
a/pipeline/train/configs/training/teacher.transformer-ens.train.yml b/pipeline/train/configs/training/teacher.transformer-ens.train.yml deleted file mode 100644 index 0c3c3e79a..000000000 --- a/pipeline/train/configs/training/teacher.transformer-ens.train.yml +++ /dev/null @@ -1,22 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -after-epochs: 8 -beam-size: 12 -clip-norm: 5 -cost-type: ce-mean-words -disp-freq: 500 -early-stopping: 5 -exponential-smoothing: True -label-smoothing: 0.1 -learn-rate: 0.0003 -lr-decay-inv-sqrt: 16000 -lr-report: True -lr-warmup: 16000 -max-length: 100 -maxi-batch: 1000 -mini-batch-fit: True -mini-batch: 1000 -normalize: 1 -optimizer-params: [0.9, 0.98, 1e-09] -save-freq: 5000 -valid-freq: 5000 -valid-mini-batch: 64 \ No newline at end of file diff --git a/pipeline/train/configs/training/teacher.transformer.train.yml b/pipeline/train/configs/training/teacher.transformer.train.yml deleted file mode 100644 index 12ecc7db6..000000000 --- a/pipeline/train/configs/training/teacher.transformer.train.yml +++ /dev/null @@ -1,21 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/transformer -beam-size: 6 -clip-norm: 5 -cost-type: ce-mean-words -disp-first: 10 -disp-freq: 500 -early-stopping: 10 -exponential-smoothing: True -label-smoothing: 0.1 -learn-rate: 0.0003 -lr-decay-inv-sqrt: 16000 -lr-report: True -lr-warmup: 16000 -max-length: 100 -maxi-batch: 1000 -mini-batch-fit: True -normalize: 0.6 -optimizer-params: [0.9, 0.98, 1e-09] -save-freq: 5000 -valid-freq: 5000 -valid-mini-batch: 64 \ No newline at end of file diff --git a/pipeline/train/eval.sh b/pipeline/train/eval.sh index e51ccbb61..56f18ac03 100644 --- a/pipeline/train/eval.sh +++ b/pipeline/train/eval.sh @@ -12,38 +12,38 @@ test -v GPUS test -v MARIAN test -v WORKSPACE -model_dir=$1 +eval_dir=$1 datasets_dir=$2 -src="${3:-${SRC}}" -trg="${4:-${TRG}}" +src=$3 +trg=$4 +models=( "${@:5}" ) 
-config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml" -eval_dir="${model_dir}/eval" - -echo "### Checking model files" -test -e "${config}" || exit 1 mkdir -p "${eval_dir}" -echo "### Evaluating a model ${model_dir}" -for src_path in "${datasets_dir}"/*."${src}"; do - prefix=$(basename "${src_path}" ".${src}") +echo "### Evaluating the model" +for src_path in "${datasets_dir}"/*."${src}.gz"; do + prefix=$(basename "${src_path}" ".${src}.gz") echo "### Evaluating ${prefix} ${src}-${trg}" + pigz -dc "${datasets_dir}/${prefix}.${TRG}.gz" > "${eval_dir}/${prefix}.${TRG}.ref" + test -s "${eval_dir}/${prefix}.${trg}.bleu" || - tee "${eval_dir}/${prefix}.${src}" < "${src_path}" | + pigz -dc "${src_path}" | + tee "${eval_dir}/${prefix}.${src}" | "${MARIAN}"/marian-decoder \ - -c "${config}" \ + -m "${models[@]}" \ + -c "${models[0]}.decoder.yml" \ -w "${WORKSPACE}" \ --quiet \ --quiet-translation \ --log "${eval_dir}/${prefix}.log" \ -d ${GPUS} | tee "${eval_dir}/${prefix}.${trg}" | - sacrebleu -d --score-only -l "${src}-${trg}" "${datasets_dir}/${prefix}.${trg}" | - tee "${eval_dir}/${prefix}.${trg}.bleu" + sacrebleu "${eval_dir}/${prefix}.${TRG}.ref" -d -f text --score-only -l "${src}-${trg}" -m bleu chrf | + tee "${eval_dir}/${prefix}.${trg}.metrics" - test -e "${eval_dir}/${prefix}.${trg}.bleu" || exit 1 + test -e "${eval_dir}/${prefix}.${trg}.metrics" || exit 1 done diff --git a/pipeline/train/finetune-student.sh b/pipeline/train/finetune-student.sh deleted file mode 100644 index bd78d8ba9..000000000 --- a/pipeline/train/finetune-student.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -## -# Finetune a student model. 
-# - -set -x -set -euo pipefail - -echo "###### Finetuning the student model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -alignment=$5 -student=$6 -extra_params=( "${@:7}" ) - -test -v SRC -test -v TRG - - -mkdir -p "${dir}" -cp "${student}" "${dir}/model.npz" - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/student.tiny11.yml" \ - "pipeline/train/configs/training/student.finetune.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - --guided-alignment "${alignment}" \ - "${extra_params[@]}" - -echo "###### Done: Finetuning the student model" - - diff --git a/pipeline/train/train-s2s.sh b/pipeline/train/train-s2s.sh deleted file mode 100644 index 002f1d543..000000000 --- a/pipeline/train/train-s2s.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -## -# Train a shallow s2s model. -# - -set -x -set -euo pipefail - -echo "###### Training s2s model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -src=$5 -trg=$6 -extra_params=( "${@:7}" ) - - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/s2s.yml" \ - "pipeline/train/configs/training/s2s.train.yml" \ - "${src}" \ - "${trg}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - "${extra_params[@]}" - - -echo "###### Done: Training s2s model" diff --git a/pipeline/train/train-student.sh b/pipeline/train/train-student.sh index 5b450e9f4..e0a5f9afb 100644 --- a/pipeline/train/train-student.sh +++ b/pipeline/train/train-student.sh @@ -8,27 +8,15 @@ set -euo pipefail echo "###### Training a student model" -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -alignment=$5 -extra_params=( "${@:6}" ) - -test -v SRC -test -v TRG - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/student.tiny11.yml" \ - "pipeline/train/configs/training/student.train.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ +alignment=$1 +extra_params=( "${@:2}" ) + +cd "$(dirname "${0}")" + +bash "train.sh" \ + 
"${extra_params[@]}" \ --guided-alignment "${alignment}" \ - "${extra_params[@]}" + echo "###### Done: Training a student model" diff --git a/pipeline/train/train-teacher.sh b/pipeline/train/train-teacher.sh deleted file mode 100644 index dc3abee07..000000000 --- a/pipeline/train/train-teacher.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -## -# Train a teacher model. -# - -set -x -set -euo pipefail - -echo "###### Training a teacher model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -extra_params=( "${@:5}" ) - -test -v SRC -test -v TRG - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/teacher.transformer.yml" \ - "pipeline/train/configs/training/teacher.transformer.train.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - "${extra_params[@]}" - -echo "###### Training a teacher model" diff --git a/pipeline/train/train.sh b/pipeline/train/train.sh index 15f67145f..b089fb8a7 100644 --- a/pipeline/train/train.sh +++ b/pipeline/train/train.sh @@ -8,10 +8,8 @@ set -euo pipefail echo "###### Training a model" -#TODO too many positional args here, replace with names args - -model_config=$1 -training_config=$2 +model_type=$1 +training_type=$2 src=$3 trg=$4 train_set_prefix=$5 @@ -24,6 +22,7 @@ test -v GPUS test -v MARIAN test -v WORKSPACE +cd "$(dirname "${0}")" mkdir -p "${model_dir}/tmp" echo "### Training ${model_dir}" @@ -32,7 +31,7 @@ echo "### Training ${model_dir}" "${MARIAN}/marian" \ --model "${model_dir}/model.npz" \ - -c "${model_config}" "${training_config}" \ + -c "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \ --train-sets "${train_set_prefix}".{"${src}","${trg}"}.gz \ -T "${model_dir}/tmp" \ --shuffle-in-ram \ @@ -40,7 +39,7 @@ echo "### Training ${model_dir}" -w "${WORKSPACE}" \ --devices ${GPUS} \ --sync-sgd \ - --valid-metrics bleu-detok ce-mean-words perplexity \ + --valid-metrics ce-mean-words bleu-detok chrf \ --valid-sets 
"${valid_set_prefix}".{"${src}","${trg}"}.gz \ --valid-translation-output "${model_dir}/devset.out" \ --quiet-translation \ diff --git a/pipeline/translate/decoder.yml b/pipeline/translate/decoder.yml index 4ebbbf520..664a9f733 100644 --- a/pipeline/translate/decoder.yml +++ b/pipeline/translate/decoder.yml @@ -1,7 +1,8 @@ normalize: 1.0 word-penalty: 0 mini-batch: 16 -mini-batch-words: 2000 +#mini-batch-words: 2000 # 1 model or 24 gb GPU +mini-batch-words: 500 # 12 Gb GPU, ensemble of 4 teachers maxi-batch: 1000 maxi-batch-sort: src max-length: 200 diff --git a/pipeline/translate/translate-nbest.sh b/pipeline/translate/translate-nbest.sh index 6099acbc3..97a66b30b 100755 --- a/pipeline/translate/translate-nbest.sh +++ b/pipeline/translate/translate-nbest.sh @@ -14,9 +14,10 @@ input=$1 vocab=$2 models=( "${@:3}" ) +cd "$(dirname "${0}")" "${MARIAN}/marian-decoder" \ - -c pipeline/translate/decoder.yml \ + -c decoder.yml \ -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ @@ -26,3 +27,4 @@ models=( "${@:3}" ) -d ${GPUS} \ -w "${WORKSPACE}" +test "$(wc -l <"${input}.nbest")" -eq "$(( $(wc -l <"${input}") * 8 ))" \ No newline at end of file diff --git a/pipeline/translate/translate.sh b/pipeline/translate/translate.sh index b5389f6e4..f046ae532 100755 --- a/pipeline/translate/translate.sh +++ b/pipeline/translate/translate.sh @@ -15,8 +15,10 @@ vocab=$2 models=( "${@:3}" ) +cd "$(dirname "${0}")" + "${MARIAN}/marian-decoder" \ - -c pipeline/translate/decoder.yml \ + -c decoder.yml \ -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ @@ -25,3 +27,4 @@ models=( "${@:3}" ) -d ${GPUS} \ -w "${WORKSPACE}" +test "$(wc -l <"${input}")" == "$(wc -l <"${input}.out")" diff --git a/profiles/snakepit/config.yaml b/profiles/snakepit/config.yaml deleted file mode 100644 index 3a27972a1..000000000 --- a/profiles/snakepit/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -cluster: "submit.py" -cluster-status: "status.py" -jobscript: "jobscript.sh" -jobs: 10 
-immediate-submit: false -verbose: true \ No newline at end of file diff --git a/profiles/snakepit/jobscript.sh b/profiles/snakepit/jobscript.sh deleted file mode 100644 index a72f24e3a..000000000 --- a/profiles/snakepit/jobscript.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# properties = {properties} -{exec_job} \ No newline at end of file diff --git a/profiles/snakepit/status.py b/profiles/snakepit/status.py deleted file mode 100644 index ca375ed10..000000000 --- a/profiles/snakepit/status.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 - - -import sys -import subprocess - -job_id = sys.argv[1] - -try: - cmd = f''' - unset http_proxy - unset HTTP_PROXY - pit show job:{job_id}''' - - res = subprocess.run(cmd, - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True) - - info = res.stdout.decode() - - if 'FIN' in info: - if 'Status code: 0' in info: - print("success") - else: - print("failed") - else: - print("running") - -except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: - print("failed") diff --git a/profiles/snakepit/submit.py b/profiles/snakepit/submit.py deleted file mode 100644 index ba396b67d..000000000 --- a/profiles/snakepit/submit.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import argparse -import subprocess - -from snakemake.utils import read_job_properties - -jobscript = sys.argv[-1] -job_properties = read_job_properties(jobscript) - -request = '[]' # cpu only -if "resources" in job_properties: - resources = job_properties["resources"] - - if 'gpu' in resources: - num = resources['gpu'] - # todo: find available models - request = f'[{num}:txp]' - -name = job_properties.get("rule") -cmd = f''' - unset http_proxy - unset HTTP_PROXY - mkdir -p empty - cd empty - pit run snakemake-{name} {request} -e "bash {jobscript}"''' - -try: - res = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -except 
subprocess.CalledProcessError as e: - raise e - -res = res.stdout.decode() -number_line = '=> job number:' -job_id = res[res.find(number_line) + len(number_line):].strip() -print(job_id) diff --git a/reports/evaluation.rst b/reports/evaluation.rst index 4aca21f56..133588b7e 100644 --- a/reports/evaluation.rst +++ b/reports/evaluation.rst @@ -1 +1 @@ -.. include:: {{ snakemake.output[0] }}/{{ snakemake.wildcards.name }}.bleu \ No newline at end of file +.. include:: {{ snakemake.output[0] }}/{{ snakemake.wildcards.name }}.metrics \ No newline at end of file