From 8285a08e08605504d6a0355ae7022d5df7993139 Mon Sep 17 00:00:00 2001 From: Cam Nugent Date: Tue, 10 Mar 2020 16:12:41 -0400 Subject: [PATCH] moving tests to seqio --- .../test_kmerseq.cpython-37-pytest-5.2.1.pyc | Bin 0 -> 2734 bytes .../test_seqio.cpython-37-pytest-5.2.1.pyc | Bin 0 -> 8646 bytes .../test_training.cpython-37-pytest-5.2.1.pyc | Bin 0 -> 12013 bytes alfie/__pycache__/training.cpython-37.pyc | Bin 10808 -> 10908 bytes alfie/test_kmerseq.py | 40 ++--- alfie/test_seqio.py | 151 ++++++++---------- alfie/test_training.py | 140 ++++++++-------- alfie/training.py | 18 ++- 8 files changed, 168 insertions(+), 181 deletions(-) create mode 100644 alfie/__pycache__/test_kmerseq.cpython-37-pytest-5.2.1.pyc create mode 100644 alfie/__pycache__/test_seqio.cpython-37-pytest-5.2.1.pyc create mode 100644 alfie/__pycache__/test_training.cpython-37-pytest-5.2.1.pyc diff --git a/alfie/__pycache__/test_kmerseq.cpython-37-pytest-5.2.1.pyc b/alfie/__pycache__/test_kmerseq.cpython-37-pytest-5.2.1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45f10302e075f166e55ef2c32e624dd6a4fb97d3 GIT binary patch literal 2734 zcmcJQ-*4MQ9Ki2v$8p>wYu9#cV;g2{)htHZjg@U^MH>SqK!h;@Lw>-xxl8KRiL*P0 zt;Obng0vSz{(wC!kNgjaKg6#{NO?s(O~Clx+0C-4g`$bG?(6e?@4nyf=YD+sW~E}@ z`Mv%5F$JMt&_sc6bQvmt9RxuH2gt$lt#nicVbsx_0?stLqk~2XiXFo-pid>*x5z0G z{4uf%BbblsHcrd$cPM*@dQqQI-W0u}OhGL{y$Y2F8bT5HBFdJMpcvi42_`sEVjL^$ zN}|RpQP&N2ooJ*GKpcq%-ZY{ITBndeG;tx;NO2281O_P)^Lx<8D*q|c9~8+{thO*I z3tKH^sF6x!JSYv(y3W4_y&0n+X7`{qC0bnt^rDT2SiY4ZCe{1u0hH)uADNDIA49ts zl|fsHi$lDHzX87*N-$IvVn(;B36AlteThnD5+hb)0~ktiNiYmBJFq8;{W(QxfcXG? z362tQ%nEx?a=?i9V1y(A`wbcQPAqE9WB}9LeNAwXd2%2wN+vUBGM{Dwt`cwn7p(7d z-zkTHHJXlr90(7RL*#H<-GTG3J#jui=2ZCI3C<&9&ZFD=HBk6j!5K{t@jzjh1gB`j zo*k9U+k5uxoX3dz6z2=XA}{X1sjy%7#Cd$osq!!Wg;V8+$DArV@xO7-{KGj_+0T38 zdTGq1@e31NC&v3XzHbAS|0cM^JsYU(55cwLoqMMJi}pX5jpl~8Wldd0G5#9E_5#1@N1Myw42FSD zM)IW>^r=-3yY#fx3#iAb)rWdK8)f-i>W*)|yto#2=wh?mZ_~w9zq{xKEuZF6cz4$! z`Eq)@*4r3~bZe`r(extP4%tR(wETc}y$((F!0*y-Xyb><%8HFQPoF&1+c;};^Q*jJ ztyrS9Wb>MsU?$c|*0XH$)O_p-?5i+Lh_ga0Z&ve|#q{$q7&_0}cm#hEG!j1n@b<3I zo!j;XjojNV0R-ME4Y--q_?p+72nDR$8WT2I8HDh3j#Z)cL<|?R7%q-f&^X!^C{kXP z^4h2Z-4N5#E-}FhoDwRC&3?QJ0!#~h9*zEd^yrbJH5v`D2lASfjnO`^I-1E$f3S<` zU2w$?x-CX;?~Lr|^$#0w)vws|O#Hi3O>XGC+molSCXN}LDmU8bQO*Q&YUF*K!EYC( z=H+at>#=mkZF)iAGTLL!u+#GxP0OHmSNnbdhm*rW31I|J54wCU><2^+b=yI>>IJU! zTynYBqQXL2&Cb3bcHd?!Wa+F6H?8AE^4NP_F9Ho%l-rpWm# zenjjQ@ci(<+`n{2UX#frBi5ha1A!C`>xzyvWWZB~XAatRcvn#w`ZJB0YjosD&9qX5 z(Wb1~oV{pvw^qh%5n{2;)Y#r|b__bwL))h*TMXE?8p{$|S{iqy#k`LlfjMjegj13E lsFo)oc|fHLxq1_#CcGGzIw9!?^bL_fxdFmJ`;mdm%HJH^-@yO? literal 0 HcmV?d00001 diff --git a/alfie/__pycache__/test_seqio.cpython-37-pytest-5.2.1.pyc b/alfie/__pycache__/test_seqio.cpython-37-pytest-5.2.1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6e589bb6d1fe2db5cbf710e40d73622c268625f GIT binary patch literal 8646 zcmd^EOK;rP6(%{}w+kQSM5PxaaN9o4g7>nJ}_^SlG zCfAj&DuRgDVs*8P8{m$(DjU5gvlxZK^bcTnWA>-c$fGz*y0MDPQZ!dZ*;HBD8GWQ} zN_XOR2lj*`ZOZ&UdOGzQ3hd~+=!?A^?3GQKWge=NQa8@VSk{eOGWw%V5?(2HbW^@3 zzmMyyQdhzzV~ceaHojLp4`e<&R74Z~h#tuHuW-FRVuHsFc!;Pl|63HpKE!q4F>(E8 zsJIS>aVd7|F>xvO^I=>HU)ra*4n^Xc-2VPOQ23vRitBI~SIoZln7CrL62=wdseOuT zYR|Z0{5M0zH66yK+PTNZs52^merUuYJe5AboG*LwykENwb7d-R54F08;)6R@(oX;;2K8V^x`$E zVK$n1`E%vUmAu@Uf96bUV<~UzKMp#%D>>@T=j}7JU(DNiy_>t@k6z5%oER6B7lRKq zi_|O?(Om3g0?c&c<-ARn%bk>9L%M4q!QNq<-AP%ELO9x+q9Om@E*2_n|K7HH)524@ zVHhX3dou$klXLG#FckPXcb!hUKl@^#Y`>Gw3dG)_QcFCe0zC}ed%@N=Cvx?)V9e%_ z7m}x6^<7H__r^xLs<4x?c>uy}F*>Ry~1mTlUe{wBR4 z;)J~BCF$z^eO`zjoQob@h#s65L5k3d;%xt#mx>|~|6pPMw-Wybg8zK@*UZa^w@%!7 z`oB4d)9sJYNQw&KLYpR~@H>e9o7iWhELh*S=z%3J6~9`Gl9DH|@oxfWD6{A-7{w-J z)~EQVW)N49N)S^l{&1u#u~A<(v7|Vbk#bUyV2Z1_QXs$5kY9{t*jOmPGAuhFzs4cI z(rkhqU)gOT%&ic_{J`qi> zj};zFpw%{I%-;}r(MLo%b|T{SGvNhU39nxYFZAJcVz{VS8tMIHXK66KLzcJJ9Y4$4 zrbBtPYBo%c6v@JLU+5eYS+xZ3Iiq3KjhbbY=r#A#SJZsVC_5$}G|z0EU0H8bt>~#o zw`{GN(QphD47*MY?r|*S%7{5XBABK}bkm~*+>248qUp>st5PXp$}-4c2jupEeJAF2 z=SbxIZJ^toV-e#H)Pr;Hgx&7wTM_oTQY53iEYi_eK!xXUi}Fy7*zq(q85)rJnAmkD z0wz<1zKK0$*MR-?oW$CwLa zZ+DWB+em_-(yTF)ms;GcSa*9DaGJR4;ulABnuLA{T4Y78~3aqx{>Lqf8UgUlLoGo&SlK-jN&> zlMuGZn4%D-OuSVHE6BvJQT7(umkQw+isF%aJS>Fc$i#}D(}PU>8RZ|Qi%gsh3*nTL zc%+L$_&wN@4rSt>qL-o`GExeqaNpw*c~DWzKPC_A!GlU^6xJw_X^*pZEs@D96PC!J z&whFA0{gVY#;2vO#wNN6Wa$L5wCn=^9S!;M(ok_8U7&2KFZPp z9dfbQCxG&h;18Wmvcs+}3i-I>ql^cIJfYMGDNsplDnvQG3(C)iLOBznq*8W2P#y_U z9^HXb;rE9^c`SnRFZ+U$p7<;_g&qHiBA$G12ih249187;5N*t+cX)rE{2njOX!0Dm zx=i(gUFb0+$ghcCui(;q@T({<&-Keobhee33j^il1*#cQsCv367EwwWEUyZ}rCM@< znsd}#7@*!ueyMrMQ>or6rcehjkRT$B)%y+slzpPd)%rvax(1bBh0$m2Xz2N-ZI8K4bJOkqik|2bc@q#Yb$HZtEiI zcx`oMt+yiV%PXs^IHGY3lgAqER#$p2`*tz3jNujPVJ38Gw%#kS=f^nMP~vfFUZmzp z1VWP2y^2!&!e^{*vPgRi#no(Vyqbjq}%mHdHDk0BH$<9^)0O? zj+4Q03aV)8VH{O;hE1~>HzQQhSfGlq6e&J$kSa=G)3H%Mi7nktwLia!k2XTfO$cT` zBA7kPj^eIlT|J`d6n<-{*rPh+cR<=__F0-Q)O6XcsMv9#>BgzrcE`U^(`A8J1}cf2 z1kypYr*@!?@$67&PlsF?v)|emw9o87`|K`gKN$+`b0J#Q)}KCF!R%+YGkbd{v-1~_ zNWLa!e-)R0n#?W;%4ha=l-Yd}CiWDhyPerNsXv|HhSN~ zCtj!@5j9zO@1% zw1ck|2&Sh(q%3sCBFo?l&^%)JT%ZUXr|cNjs!_$_eAU4^0UbNlveP3bc_O7UZIr83 z%tUiyN}AoFUA5f1fw{R*Ez^b=H()DM4^Yr0IK4Lrzy!xvW{Fe1{lt>O8^NQhnx0Cx zl;8BHr>k&!xrDgj-=$u-3c;__Aw5!0w)_$K)O!mN|HU-Ln`XZc|Gk9BBo;2>_|YPFR7kun1j*5-B`_FJu|yok#a)Ri6tTJg?f{%*UT4bE}K@#G-*Q8lqorz*0S0mZR~%5glp{p literal 0 HcmV?d00001 diff --git a/alfie/__pycache__/test_training.cpython-37-pytest-5.2.1.pyc b/alfie/__pycache__/test_training.cpython-37-pytest-5.2.1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0cc1defe54e52a5e15987ffa7de528b50908640 GIT binary patch literal 12013 zcmeHNYiu0Xb)MJG&Mx=C$C46dOQvNz*18fUzVvV`S@J_Rl2}z_CQi3$)P8!B_99@~0>jsnpU?Pi6hC%4g%QDvsRIrjI& zy@t}&aik!#x-s6(dJ}&3mfAJMQE?2CJ|SkrlR^{6A>C8rY4MEsjCfX@5M}Y4_%(4- zoT`<&BfjB}K--+3BW-t8=^sen6Oq2TUCn(QQjx|vXgo{pvBszS8tqxO~!H zmox&3RF1X(mGq}5!kb*!nOz=oBrbf1SBDc3RnOjPx2kTZ0dKRm*;@7L?bd9?dZXQF zY`Uvpa!tRlKSye&kM?Nxz<4qjyVUU{QE59|`RXt@x-P43}SN)#$ zjsY3eoP#fxQi=hQPNI@|MQH~ZUBVdmltmClsiS$Ey;IN9#Z#qicr z*$q8DLpiR9coULrPJW%L71T|>5 zlvDC3+9s*zh%Cq_qRaC;#e`;auu5Qn=}cLZPlJV}4h-u4-XyMjX(E_tC_M|LTne;~ z2r{o=;9Zqu{uDllniE+4(OsFk|EC(H*!aO@wZnsJmPQcCLhOzTq5B{ z=F=x|gjww>Njjk%p8qe8$4RaUW*F_pLwXX?%(^gd8TD0P<9W-NXYFNp zruI$LXL+W!i_%Cm8yOShF~qA6h(+Zb$mMVK(*Gc;7%fw8z>#)|VX|>ny#+ z^DW8pqm1~B8E1m)yUc|-C#3iRbD<2b#}B5ZNFzEP-8sHI9HH$d{C(AR-bqB|t5@RR za{PNO{=FRkVzw(W+j7kInq;4Q6@w@YZHNTTI_3Zr&a9@r#U)8WqH+vR#jU=4PqY0P z#~)@{8INLAKQ&7EOzDNR*kC&H;kb9R`h`dCEJcE3N$g4!CpXF;rUI!^>D<<`|^N#!t zx>q6cc_Q2%M~Z|yT}b-LCHXaCtP&wNl<<+0p^uD3+;m#2j!XXCT?A1yLmO93E$JxV zX18Z(B~{~^fkRy98o7e!Vz)qsEhduJNjS~#iz#u4(_5sp zC?DqZ)`HWt{Ir;c)0`AX$R)7TG~qVRVk$o(j`}9t<}qrG9m^B_w8#A!P5ZqRoZ>&w zq@PM~7f429j$aT5JJ$aqj>xgja2YO?&xz!SuhEKl@{UPqFcR9Q{0zI*Y?SUHnc$Ro z5^0`>gVJJ3`Aq1mim_>>JK~M|2mwfR74bB#^SI9AI_2ls9tFQZ_JEW7;6N66CbCEY ziQsieNERu;BA=o5*dovN^@Yv;2^4e@1&rFd)^?s?|Mx4&!1mzW!6 z*kg&yCb-YuG4S1t6GV`Bu0zjxC4O#)c&Uu%sJ)_(G3nFzEBa$-;R3fX5q)g{Z{n|w z#?;_z8}o}|Vj6w~Us%a6Z5-}u;`uH>0KX7TXK3`KWbQy#`)p)2eK+O44Oz%)G+BFr z+9Rv6e2|*Y{hkN(BQ7#{HYqNN7yU`@@zF^3(S+CAnx-%yS<#&> zWJAu@Je3xRED~9|auTyro+`Zdh#trSk-?Uk`4RnS3xo!HG`tRM0WuiPNyB;TBl=Mm zU>~;p^75nq16m-k>{h*LH|lm^t=Pz~BuXg&%C&ls?lkO{XCt}n$hHSO%?3c2K^ZB6 zTNi;^vb06e* zA2jj-DCA9N+s$@Q0EJ8dk%TKUau%eK9ae#zmCZ1Bor7F*o(Lh5a)t;cT*I&=XTN|Y z=Vcvry>lSINw1U8$aO=ZQP?Lmp4ulv$Vt*ftY^zEL#jYZ!5ZiQk^7QSRxN)MgaFgC za)B;MGESP1@LN1b$XB?&eCE<@PAUg@|vEi1#rbBzPE<3y6NR*@7Jt z@)MYAMb?{-0A0Dyd?e(s<;d_Pb?2Y~XxbuoTO%sHUiFn}an9VkYObudCb$8-UR$&;iW zp$C1mKYrQZU4)PCL;KP7_#g%bbkV{PiUNakua~Fg^Qe=bCBjg~SY!o4Awr-aFlgNK z5dS3LhLm_mkdC^a6%TY6MBb!#)7Y1nK+02ci^^>xzfR-}M7{`;#E35>FU}9}5&0%D zGF~1*#tR{W@&>hX9i)Gln}&Cf(A_xw-`XNInZQ13HgYvHTT?FU|_7T8#4`Q#Hj%7aJA)H11ixB@|#j zKPDKwk!qS<+Q^X;g+{t*b+f>HvVKMsf%(uz2W0<@`2Z7fzw2f8vSNZaHI(QLoU=}P zBYQc9%KsVld4j*?FHqt|Uu2Tl6w&8E8wT@Go~p_MbA@>-@F#}LM{tkcVIjjiOcL`u zrV}k(r53!fh3nJ;mGPcwZVi_6sf~h0wQ_ z;j!2Pn)^C)0RaHl4dy}_T*vx01kN_FnC5N~KU++b?-T!nEhc6-$1g2IKjtNSFD)2_aj-qj@Vx015twH8+M5@YLicr3&+)%!PGji)EugT}|A#i+@&roTW z2rna^q7sE|c>yFu?Nx@NuSToPpFRRlUhpAPB1Ygjf+SXlTh6=HElJ-x!1s9Cg22OZ z^*%B4BZC}*Y9h$bfj^9Q;fgI|`*H^FliNf(Aj1J3)1C^g*e>?Igt{O_5^nFjnh3rh zU#eu#9SBh_g@Gw$OKES15wYN|Z?3Ijy=SvkciS!bJ0vX4fj%b6k39{%EKD0M>aODk z*)P*Z0$!di7r6rhbz6Rm-tcWAyaK>c@Ef=s3RH7pXo|wtnM54CMo+#U^GCsON#3IS z`xXc$!UcjMHEDUmrI#vNRyFaLQ=_w1;(%ju@Swm$YpW){ldpzBiC0&XVP%~IPEO0I z@_T464lOxc#G&P999lN|frEp~ZM};fD;wD1hGhXA&@RFQL%Uc_#X4hhuNwu128ALH z47ArRCDI%i`g`3h4h+0=F5K$|r=EjT$qItJRY4$exZADJcUkToc0h2=@<;4`P=dR{ z(nZYi6XKwy3;8$1K_zgIi{>Q)KEti>nHB};K3c)T2Q3Cz6sF?+aiL@H?~fzbrQ+x> z2+%$`Wn=6F43fvhnhMj~s3PP}i8(!JJzy?zY&^B&|!`+rHgP$qk9x(5|M=F5*$gwK=d|U?h7C#4j_-KMe)*l zFx}sC74`}3mz3eIz919d!q&#{Ubry-9W+@f4DZ|Ke!+}mv)S1WrvzpP+aYZ?%$JwR z2-NwKk|hpnELp-ZY@!j=2G$GV-tbHGg}YNA1{T}L6$>%dQZ2(8w??gLYw~{psFn~* literal 0 HcmV?d00001 diff --git a/alfie/__pycache__/training.cpython-37.pyc b/alfie/__pycache__/training.cpython-37.pyc index c56930b6343d58bdccdac8aef2ab17ffcd4aec5d..c24c541d09b443c7f08c00c083e9de38709f6470 100644 GIT binary patch delta 1435 zcmZ8h&u`pB6!zG=-nBR0Y|@bCN0T}UBXA+$Y@ge;D5Mw7-jn)mbLTS@C8o|LAhvf|2>M6-;32<6Ptb*cSOS_f?%T= z5Y8LTC}28Nn(EvSXeiPgr_{wIE*KFFKXmBIjH% zZ0y5#NJTi#%jrRXMGz*i&O5i{d0By@W*JU$$!b!QlXe`?2JF}b&L;qrWPWG@*}e6l z3PP*Bo5QVQ!8F#^)(j*0Y2wn>)>5C%!l@1on{?K`V!t)JF2{}8#iuVV*^<=P*Z(Cq zfcHrq#(Mn`xn8ed&qzOOZ+r4J_1ZL4!KB^l4oXprJt6tLgarvQ>=ba(7nHGRpEFyo zY7(XZaPsHWxnuZpy4P?$tizpxA#g!*IWYoqiw2oVs&e~)h%)zhjI=2>q+ppUsovTr zY=^_ijPo=PUSfxA22Q2c4t0FmCy=U+jCryP|m>+`)I?Gg1@UJ;v=@PFr|rI@mIM-bshp_5~y zuq8!@nO9O2pDw4&cRQX3SKkSJ9)%BKHy;2{7AvR-&wyG%>@x5NF2K$gx8=Bh?Ln_L z_hHU@3f6Fr(V*j~(1#0lc~!0yc;tyTIl#$dRXe~r;*vHll4i5RNb?Z-I2S}53>(G5 zZ-ZeOF0`p&ev><;R23d*M;YN*Z$~Z-I5sykt%Og4R3om7^?)2w1{!bBkW-dvaZdhn za<4iwf(tH-VrUDn26V(OiQ!M;m+JXa)_x~F_?a_`E+l82nU&|^oTITEyk4wwAret!Fm?!uAVkWdNPt;&c6yv)=jHBR z*^bs)aexyX$efUXgd<2Gg}rb>TtVUw^c{%{R}P2+RkOxCx}|UWtLm<*uCAI--}tUs zeW_Y?6&T*HyYKMF)ju`IKnlYtCiIn80wbt=q#qdvYdzzU)>B}rb>$0fTiKa=TRW;8 znLXuQy;rG&2EJxp(Uc?W$lkkjN9kETJJfC~zDY&~AL!p`gj|hX51fDW$=spPe9_o!aHo6JVR20{Fwd5=UILvn1bmulTyQ_QVvboZ(__eaG01aGmKZ5JR4Y{n~^59q^2orFl?p-{OA^2KpVQ34(kuE1m z9@7@=xCNX~11QD&M_ zaAIA~c} zoR)Z?W(x=>H8d=Yk({!mD~O1))Hm1+Y?12WP^aeq_ddq&b;A&9pqoZKOMi#md=x-g zuBkT6QLU!3>w|}jpBvR1Xv$zT?)sW6ZY+6YucO4`Stt?$opP|`^`$`ql1Q7h+g(Q5 zhtQ5joN=hrEEiD%b;_ibP{E=$_eZ2ALP!i{giABY0vdDa-Yyd%A`(lT1c5YTa!45@ zaGR!_G7p@xM(KH*WjXYLw6LY1cYUjv^2a8n6&x#U?2UUQhxUKopKp|%c+!q?oWx;7 zTS1z(0veKTEd1iWzi^R)ap)HOXT1z5d=ed=M!?d$>6_(4v-2pqfbdN5vp=(l?;y>b zqygdV5)xNHz!?kUY@gjhK{2vCyCJKk+Y6lpE(N_=*5@SJDa-XLuC60s82@^&VcbQy Sym;za_llzaJzeUqo9bT_?}2Fm diff --git a/alfie/test_kmerseq.py b/alfie/test_kmerseq.py index a3d5c85..71f59ed 100644 --- a/alfie/test_kmerseq.py +++ b/alfie/test_kmerseq.py @@ -1,37 +1,23 @@ +import pytest +from alfie.kmerseq import KmerFeatures -import unittest - -from kmerseq import KmerFeatures - -class KmerTests(unittest.TestCase): +def test_KmerFeatures(): """Unit tests for the KmerFeatures class.""" - @classmethod - def setUpClass(self): - """Initiate the test class instance.""" - self.test_kmers = KmerFeatures("test1", - "aaaaaattttttatatatgcgcgccccccgccgcgccgggc") - - def test_KmerFeatures(self): - - self.assertEqual(self.test_kmers.name, - "test1") + test_kmers = KmerFeatures("test1", + "aaaaaattttttatatatgcgcgccccccgccgcgccgggc") + + assert test_kmers.name == "test1" - self.assertEqual(self.test_kmers.labels.shape, - (256,)) + assert test_kmers.labels.shape == (256,) - self.assertEqual(list(self.test_kmers.labels[:3]), - ['AAAA', 'AAAC', 'AAAG']) + assert list(test_kmers.labels[:3]) == ['AAAA', 'AAAC', 'AAAG'] - self.assertEqual(list(self.test_kmers.labels[-3:]), - ['TTTC', 'TTTG', 'TTTT']) + assert list(test_kmers.labels[-3:]) == ['TTTC', 'TTTG', 'TTTT'] - self.assertEqual(self.test_kmers.kmer_freqs.shape, - (256,)) + assert test_kmers.kmer_freqs.shape == (256,) - with self.assertRaises(ValueError): - self.assertEqual(KmerFeatures("test1", "NOTDNA")) + with pytest.raises(ValueError): + KmerFeatures("test1", "NOTDNA") -if __name__ == '__main__': - unittest.main() diff --git a/alfie/test_seqio.py b/alfie/test_seqio.py index 684de25..dfac7b7 100644 --- a/alfie/test_seqio.py +++ b/alfie/test_seqio.py @@ -1,99 +1,84 @@ -import os -import unittest +#import os -from seqio import file_type, outfile_dict, read_fasta, read_fastq +import pytest + +from alfie.seqio import file_type, outfile_dict, read_fasta, read_fastq from alfie import ex_fasta_file, ex_fastq_file -class SeqioTests(unittest.TestCase): - """Unit tests for the seqio functions""" - @classmethod - def setUpClass(self): - """Initiate the test class instance.""" - self._expected_kingdom_dict = {0: 'alfie_out/animalia_test.fasta', - 1: 'alfie_out/bacteria_test.fasta', - 2: 'alfie_out/fungi_test.fasta', - 3: 'alfie_out/plantae_test.fasta', - 4: 'alfie_out/protista_test.fasta'} - - self._fasta_infile = ex_fasta_file - self._fastq_infile = ex_fastq_file - - @classmethod +""" +#TODO - unit tests for write - see if buffer or make and destroy files is best practice +#when you add the write tests, do this in pytest def tearDown(self): - """After unit tests, remove the temporary outputs.""" + #After unit tests, remove the temporary outputs. try: os.rmdir("alfie_out") except OSError: pass +""" + +def test_file_type(): + """Test that the file type is properly identified.""" + assert file_type("file_1.fa") == "fasta" + assert file_type("file_1.fasta") == "fasta" + assert file_type("in.file_1.fa") == "fasta" + assert file_type("file_2.fq") == "fastq" + assert file_type("file_2.fastq") == "fastq" + assert file_type("in.file_2.fq") == "fastq" + + with pytest.raises(ValueError): + file_type("infile_2.txt") + + with pytest.raises(ValueError): + file_type("in.file_2.csv") + + +def test_outfile_builder(): + """Test that the output file set is generated properly.""" + expected_kingdom_dict1 = {0: 'alfie_out/animalia_test.fasta', + 1: 'alfie_out/bacteria_test.fasta', + 2: 'alfie_out/fungi_test.fasta', + 3: 'alfie_out/plantae_test.fasta', + 4: 'alfie_out/protista_test.fasta'} + + expected_kingdom_dict2 = {0: 'diff_place/animalia_test.fastq', + 1: 'diff_place/bacteria_test.fastq', + 2: 'diff_place/fungi_test.fastq', + 3: 'diff_place/plantae_test.fastq', + 4: 'diff_place/protista_test.fastq'} - def test_file_type(self): - """Test that the file type is properly identified.""" - self.assertEqual(file_type("file_1.fa"), - "fasta") - self.assertEqual(file_type("file_1.fasta"), - "fasta") - self.assertEqual(file_type("in.file_1.fa"), - "fasta") - self.assertEqual(file_type("file_2.fq"), - "fastq") - self.assertEqual(file_type("file_2.fastq"), - "fastq") - self.assertEqual(file_type("in.file_2.fq"), - "fastq") - - with self.assertRaises(ValueError): - self.assertEqual(file_type("infile_2.txt")) + out1 = outfile_dict("test.fasta") + assert out1 == expected_kingdom_dict1 + + out2 = outfile_dict("in_data/test.fastq", folder_prefix = 'diff_place/') + assert out2 == expected_kingdom_dict2 + + +def test_fasta_reader(): + """ Test the fasta reader functions.""" + fasta_read = read_fasta(ex_fasta_file) - with self.assertRaises(ValueError): - self.assertEqual(file_type("in.file_2.csv")) - - def test_outfile_builder(self): - """Test that the output file set is generated properly.""" - self.assertEqual(outfile_dict("test.fasta"), - self._expected_kingdom_dict) - - self.assertEqual(outfile_dict("in_data/test.fasta"), - self._expected_kingdom_dict) - - def test_fasta_reader(self): - """ Test the fasta reader functions.""" - self._fasta_read = read_fasta(self._fasta_infile) - - self.assertEqual(len(self._fasta_read), 100) - - self.assertEqual(self._fasta_read[0]['name'], - "seq1_plantae") - self.assertEqual(self._fasta_read[1]['name'], - "seq2_bacteria") - self.assertEqual(self._fasta_read[2]['name'], - "seq3_protista") - - self.assertEqual(self._fasta_read[0]['sequence'][:25], - "TTCTAGGAGCATGTATATCTATGCT") - self.assertEqual(self._fasta_read[1]['sequence'][:25], - "ACGGGCTTATCATGGTATTTGGTGC") - self.assertEqual(self._fasta_read[2]['sequence'][:25], - "AGTATTAATTCGTATGGAATTAGCA") - - def test_fastq_reader(self): - """ Test the fastq reader functions.""" - self._fastq_read = read_fastq(self._fastq_infile) - - self.assertEqual(len(self._fastq_read), 100) + assert len(fasta_read) == 100 - for i in range(len(self._fastq_read)): - self.assertEqual(list(self._fastq_read[i].keys()), - ['name', 'sequence', 'strand', 'quality']) + assert fasta_read[0]['name'] == "seq1_plantae" + assert fasta_read[1]['name'] == "seq2_bacteria" + assert fasta_read[2]['name'] == "seq3_protista" + + assert fasta_read[0]['sequence'][:25] == "TTCTAGGAGCATGTATATCTATGCT" + assert fasta_read[1]['sequence'][:25] == "ACGGGCTTATCATGGTATTTGGTGC" + assert fasta_read[2]['sequence'][:25] == "AGTATTAATTCGTATGGAATTAGCA" + + +def test_fastq_reader(): + """ Test the fastq reader functions.""" + fastq_read = read_fastq(ex_fastq_file) - self.assertEqual(self._fastq_read[0]['sequence'][:25], - "ttctaggagcatgtatatctatgct") - self.assertEqual(self._fastq_read[1]['sequence'][:25], - "acgggcttatcatggtatttggtgc") - self.assertEqual(self._fastq_read[2]['sequence'][:25], - "agtattaattcgtatggaattagca") + assert len(fastq_read) == 100 + for i in range(len(fastq_read)): + assert list(fastq_read[i].keys()) == ['name', 'sequence', 'strand', 'quality'] -if __name__ == '__main__': - unittest.main() + assert fastq_read[0]['sequence'][:25] == "ttctaggagcatgtatatctatgct" + assert fastq_read[1]['sequence'][:25] == "acgggcttatcatggtatttggtgc" + assert fastq_read[2]['sequence'][:25] == "agtattaattcgtatggaattagca" diff --git a/alfie/test_training.py b/alfie/test_training.py index 075a496..6977234 100644 --- a/alfie/test_training.py +++ b/alfie/test_training.py @@ -1,87 +1,99 @@ -import unittest +"""Unit tests for the module: alfie.training """ -import training +import pytest +import alfie.training as training +import numpy as np import pandas as pd -class TrainingTests(unittest.TestCase): +#NOTE : I'm trying this in pytest as opposed to the unittest module, will see how it goes. - def test_split(self): +def test_split(): + """Tests for the stratified_taxon_split function.""" + data = pd.DataFrame({"phylum" : ["Mollusca"]*10 + ["Arthropoda"] * 15, + "data_col" : [np.random.randint(100) for x in range(25)]}) + #split on the column phylum, contians the classifications + train, test = training.stratified_taxon_split(data, class_col = "phylum", + test_size = .2, silent = True, seed = 1738) + # 80% of data in train + assert train.shape == (20, 2) + # index order is randomized + assert list(train.index) == [16, 13, 0, 17, 5, 3, 10, + 9, 18, 24, 23, 14, 2, 1, + 20, 12, 19, 6, 4, 22] - data = pd.DataFrame({"phylum" : ["Mollusca"]*10 + ["Arthropoda"] * 15, - "data_col" : [np.random.randint(100) for x in range(25)]}) - #split on the column phylum, contians the classifications - train, test = stratified_taxon_split(data, class_col = "phylum", - test_size = .2, silent = True) - # 80% of data in train - train.shape - # index order is randomized - train.index - test.shape + assert test.shape == (5, 2) + assert list(test.index) == [15, 21, 7, 11, 8] - def test_sample_sequences(self): - in_seq = "AAAAAAAAAATTTTTTTTTTGGGGGGGGGGCCCCCCCCCCAAAAAAAAAATTTTTTTTTTGGGGGGGGGG" - - sample_seq(in_seq, min_size = 25, max_size = 70, seed = 1738) - ['GGGCCCCCCCCCCAAAAAAAAAATTTTTTT'] +def test_sample_seq(): - sample_seq(in_seq, min_size = 25, max_size = 70, n = 2, seed = 1738) - ['ATTTTTTTTTTGGGGGGGGGGCCCCCCCCC', - 'TTTTTTTTGGGGGGGGGGCCCCCCCCCCAAAAAAAAAATTTTTTTTTTGGGG'] + in_seq = "AAAAAAAAAATTTTTTTTTTGGGGGGGGGGCCCCCCCCCCAAAAAAAAAATTTTTTTTTTGGGGGGGGGG" + out1 = training.sample_seq(in_seq, min_size = 25, max_size = 70, seed = 1738) + expected1 = ['GGGCCCCCCCCCCAAAAAAAAAATTTTTTT'] + + assert out1 == expected1 - def test_process_sequences(self): + out2 = training.sample_seq(in_seq, min_size = 25, max_size = 70, n = 2, seed = 1738) + expected2 = ['ATTTTTTTTTTGGGGGGGGGGCCCCCCCCC', + 'TTTTTTTTGGGGGGGGGGCCCCCCCCCCAAAAAAAAAATTTTTTTTTTGGGG'] - ex_dat = pd.DataFrame({"processid" : ["ex1", "ex2", "ex3", "ex4", "ex5",], - "sequence" : ["AAAAAG" * 50 , "AAATAA" * 50, "AAGAAA" * 50, "TTTTAT" * 50, "TCTTCT" * 50], - "kingdom" : ["animalia", "bacteria", "fungi", "plantae", "protista"]}) + assert out2 == expected2 - #process the example data with defaults - out_dat = process_sequences(ex_dat) - #dict with 4 equal lenght lists - out_dat.keys() - dict_keys(['ids', 'labels', 'data', 'seq']) - len(out_dat['ids']) == len(ex_dat['processid']) +def test_process_sequences(): - #different size k, turn off the subsampling, output a dataframe - out_dat2 = process_sequences(ex_dat, k = 2, - to_dataframe = True, - subsample = False) + ex_dat = pd.DataFrame({"processid" : ["ex1", "ex2", "ex3", "ex4", "ex5",], + "sequence" : ["AAAAAG" * 50 , "AAATAA" * 50, "AAGAAA" * 50, "TTTTAT" * 50, "TCTTCT" * 50], + "kingdom" : ["animalia", "bacteria", "fungi", "plantae", "protista"]}) - out_dat2.columns - Index(['ids', 'labels', 'data', 'seq'], dtype='object') + #process the example data with defaults + out_dat = training.process_sequences(ex_dat) - def test_shuffle_unison(self): + #dict with 4 equal lenght lists + assert list(out_dat.keys()) == ['ids', 'labels', 'data', 'seq'] + assert len(out_dat['ids']) == len(ex_dat['processid']) - x = np.array([[1,2], - [3,4], - [5,6], - [7,8]]) - y = np.array([[1,2], - [3,4], - [5,6], - [7,8]]) + #different size k, turn off the subsampling, output a dataframe + out_dat2 = training.process_sequences(ex_dat, k = 2, + to_dataframe = True, + subsample = False) - new_x, new_y = shuffle_unison(x, y, seed = 1738) + #query dataframe properties + assert list(out_dat2.columns) ==['ids', 'labels', 'data', 'seq'] + assert np.all(out_dat2.ids == ex_dat.processid) + assert out_dat2['data'][0].shape == (16,) - #is x the same as before shuffle_unison? - np.all(new_x == x) - False - #have x and y been shuffled in unison? - np.all(new_x == new_y) - def test_nn_constriction(self): +def test_shuffle_unison(): - dnn_1mer = training.alfie_dnn_default() - model1 = alfie_dnn_default(hidden_sizes = [10,4], in_shape = 4, n_classes = 2) - - model1.input.shape - TensorShape([None, 4]) - - model1.output.shape - TensorShape([None, 2]) - - model1.trainable + x = np.array([[1,2], + [3,4], + [5,6], + [7,8]]) + y = np.array([[1,2], + [3,4], + [5,6], + [7,8]]) + + new_x, new_y = training.shuffle_unison(x, y, seed = 1738) + + #is x the same as before shuffle_unison? + assert np.all(new_x == x) == False + #have x and y been shuffled in unison? + assert np.all(new_x == new_y) + + with pytest.raises(ValueError): + training.shuffle_unison(x, np.array([[1,1],[1,2]]), seed = 1738) + +def test_alfie_dnn_default(): + + model1 = training.alfie_dnn_default(hidden_sizes = [10,4], in_shape = 4, n_classes = 2) + + assert list(model1.input.shape) == [None, 4] + + assert list(model1.output.shape) == [None, 2] + + assert model1.trainable diff --git a/alfie/training.py b/alfie/training.py index b2602be..30ce535 100644 --- a/alfie/training.py +++ b/alfie/training.py @@ -25,7 +25,7 @@ from alfie.kmerseq import KmerFeatures -def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = False): +def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = False, seed = None): """ Conduct a stratified train/test split based on a user defined categorical column. @@ -45,6 +45,9 @@ def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = Fals silent : bool, should the split criteria be echoded, defualt is True. + seed : int, a random seed for repeatable random sampling. Default is None. + + Returns --------- out1, out2 : pandas.DataFrame, out1 is the training data frame, out2 is the test data frame. @@ -68,7 +71,7 @@ def stratified_taxon_split(input_data, class_col, test_size = 0.3, silent = Fals print(f'Conducting train/test split, split evenly by: {class_col}') #split off a test/valid set, 30% of the data total - strat_index = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1738) + strat_index = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed) for train_index, test_valid_index in strat_index.split(input_data, input_data[class_col]): train, test = input_data.loc[train_index], input_data.loc[test_valid_index] @@ -95,7 +98,7 @@ def sample_seq(seq, min_size = 200, max_size = 600, n = 1, seed = None): n : int, the number of random samples to generte from each input sequence. Default is 1 (no upsampling). - seed : int, a random seed for repeatable random sampling. + seed : int, a random seed for repeatable random sampling. Default is None. Returns --------- @@ -192,9 +195,10 @@ def process_sequences(seq_df, id_col = 'processid', Examples --------- #build a dataframe of artifical data - >>> ex_dat = pd.DataFrame({"processid" : ["ex1", "ex2", "ex3", "ex4", "ex5",], - >>> "sequence" : ["AAAAAG" * 50 , "AAATAA" * 50, "AAGAAA" * 50, "TTTTAT" * 50, "TCTTCT" * 50], - >>> "kingdom" : ["animalia", "bacteria", "fungi", "plantae", "protista"]}) + >>> ex_dat = pd.DataFrame({ + >>> "processid" : ["ex1", "ex2", "ex3", "ex4", "ex5"], + >>> "sequence" : ["AAAAAG"*50, "AAATAA"*50, "AAGAAA"*50, "TTTTAT"*50, "TCTTCT"*50], + >>> "kingdom" : ["animalia", "bacteria", "fungi", "plantae", "protista"]}) #process the example data with defaults >>> out_dat = process_sequences(ex_dat) @@ -256,7 +260,7 @@ def shuffle_unison(x, y, seed = None): y : np.array, the second array to shuffle - seed : int, a random seed for repeatable random sampling. + seed : int, a random seed for repeatable random sampling. Default is None. Returns ---------