From debffdb24cda91fbd10af4a7852e2989522be466 Mon Sep 17 00:00:00 2001
From: Zhongkai Fu
Date: Wed, 2 Dec 2015 23:02:15 -0800
Subject: [PATCH] #1. Support dropout for SimpleRNN (both forward and bi-directional) #2. Speed up training performance

---
 RNNSharp.v12.suo           | Bin 143872 -> 143872 bytes
 RNNSharp/BiRNN.cs          |  10 ++--
 RNNSharp/LSTMRNN.cs        |   2 +-
 RNNSharp/ModelSetting.cs   |   8 +--
 RNNSharp/RNN.cs            |  12 +++--
 RNNSharp/RNNEncoder.cs     |   2 +-
 RNNSharp/SimpleRNN.cs      | 103 +++++++++++++++++++------------------
 RNNSharp/neuron.cs         |   5 +-
 RNNSharpConsole/Program.cs |  10 ++--
 9 files changed, 78 insertions(+), 74 deletions(-)

diff --git a/RNNSharp.v12.suo b/RNNSharp.v12.suo
index 14b57f0757cbd62e31b161fe9e2acc23931a10d1..be2878193476d68c6258a6b23238e3b45be372bb 100644
Binary files a/RNNSharp.v12.suo and b/RNNSharp.v12.suo differ
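For context, the technique this patch wires in is classic (non-inverted) dropout: while training, each hidden neuron is zeroed with some probability p, and at decode time every activation is instead scaled by (1 - p) so its expected magnitude matches what the output layer saw during training. A minimal standalone sketch of that scheme (hypothetical names, not code from this repository):

    using System;

    class DropoutSketch
    {
        static readonly Random rand = new Random();

        // Training: sample a fresh mask (typically once per sequence for RNNs);
        // a masked unit contributes nothing to the forward or backward pass.
        static bool[] SampleMask(int size, double p)
        {
            bool[] mask = new bool[size];
            for (int i = 0; i < size; i++)
            {
                mask[i] = rand.NextDouble() < p;
            }
            return mask;
        }

        // Inference: no mask; scale every activation by (1 - p) instead.
        static void ScaleForInference(double[] activations, double p)
        {
            for (int i = 0; i < activations.Length; i++)
            {
                activations[i] *= (1.0 - p);
            }
        }
    }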
diff --git a/RNNSharp/BiRNN.cs b/RNNSharp/BiRNN.cs
index 3a8b4df..b5abc57 100644
--- a/RNNSharp/BiRNN.cs
+++ b/RNNSharp/BiRNN.cs
@@ -112,12 +112,12 @@ public override void SetGradientCutoff(double newGradient)
             backwardRNN.SetGradientCutoff(newGradient);
         }
 
-        public override void SetRegularization(double newBeta)
+        public override void SetDropout(double newDropout)
         {
-            beta = newBeta;
+            dropout = newDropout;
 
-            forwardRNN.SetRegularization(newBeta);
-            backwardRNN.SetRegularization(newBeta);
+            forwardRNN.SetDropout(newDropout);
+            backwardRNN.SetDropout(newDropout);
         }
 
         public override void SetHiddenLayerSize(int newsize)
@@ -453,7 +453,7 @@ public override void learnNet(State state, int timeat, bool biRNN = false)
 
         }
 
-        public override void computeNet(State state, double[] doutput)
+        public override void computeNet(State state, double[] doutput, bool isTrain = true)
         {
 
         }
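The signature change above ripples through every network type: computeNet now carries an isTrain flag that defaults to true, so training code is untouched, while decoding paths pass false to swap masking for (1 - dropout) scaling. The two call patterns, as they appear in the diffs below:

    // Training-time callers are unchanged; isTrain defaults to true.
    computeNet(state, doutput);

    // Decoding (InnerDecode in RNN.cs) disables the dropout mask explicitly.
    computeNet(state, m[curState], false);      //compute probability distribution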
diff --git a/RNNSharp/LSTMRNN.cs b/RNNSharp/LSTMRNN.cs
index 8d51957..17b8afe 100644
--- a/RNNSharp/LSTMRNN.cs
+++ b/RNNSharp/LSTMRNN.cs
@@ -579,7 +579,7 @@ public override void learnNet(State state, int timeat, bool biRNN = false)
 
 
         // forward process. output layer consists of tag value
-        public override void computeNet(State state, double[] doutput)
+        public override void computeNet(State state, double[] doutput, bool isTrain = true)
         {
             //inputs(t) -> hidden(t)
             //Get sparse feature and apply it into hidden layer
diff --git a/RNNSharp/ModelSetting.cs b/RNNSharp/ModelSetting.cs
index 2b6bb72..6f93f70 100644
--- a/RNNSharp/ModelSetting.cs
+++ b/RNNSharp/ModelSetting.cs
@@ -17,8 +17,8 @@ public class ModelSetting
         public double GetLearningRate(){ return m_LearningRate; }
         public void SetLearningRate(double r) { m_LearningRate = r; }
 
-        public double GetRegularization() { return m_Regularization; }
-        public void SetRegularization(double r) { m_Regularization = r; }
+        public double GetDropout() { return m_Dropout; }
+        public void SetDropout(double r) { m_Dropout = r; }
 
         public double GetTagTransitionWeight(){ return m_tagTransitionWeight; }
         public void SetTagTransitionWeight(double r) { m_tagTransitionWeight = r; }
@@ -60,7 +60,7 @@ public long GetSaveStep()
         int m_NumHidden;
         double m_LearningRate;
         double m_tagTransitionWeight;
-        double m_Regularization;
+        double m_Dropout;
         int m_Bptt;
         int m_MaxIteration;
         bool m_bCRFTraining;
@@ -91,7 +91,7 @@ public void DumpSetting()
             }
 
             Console.WriteLine("Learning rate: {0}", m_LearningRate);
-            Console.WriteLine("Regularization: {0}", m_Regularization);
+            Console.WriteLine("Dropout: {0}", m_Dropout);
             Console.WriteLine("Max Iteration: {0}", m_MaxIteration);
             Console.WriteLine("Hidden layer size: {0}", m_NumHidden);
             Console.WriteLine("RNN-CRF: {0}", m_bCRFTraining);
diff --git a/RNNSharp/RNN.cs b/RNNSharp/RNN.cs
index 1da560d..b8e6bb4 100644
--- a/RNNSharp/RNN.cs
+++ b/RNNSharp/RNN.cs
@@ -40,7 +40,7 @@ abstract public class RNN
         protected double minTknErrRatio;
         protected double lastTknErrRatio;
         protected long counter;
-        protected double beta;
+        protected double dropout;
         protected ParallelOptions parallelOption = new ParallelOptions();
         protected double gradient_cutoff;
         protected bool m_bCRFTraining = false;
@@ -175,7 +175,7 @@ public RNN()
             gradient_cutoff = 15;
             alpha = 0.1;
-            beta = 0.0000001;
+            dropout = 0;
             logp = 0;
             llogp = -100000000;
             minTknErrRatio = 1000000;
@@ -214,7 +214,7 @@ public bool ShouldTrainingStop()
         public virtual void SetValidationSet(DataSet validation) { m_ValidationSet = validation; }
         public virtual void SetGradientCutoff(double newGradient) { gradient_cutoff = newGradient; }
         public virtual void SetLearningRate(double newAlpha) { alpha = newAlpha; }
-        public virtual void SetRegularization(double newBeta) { beta = newBeta; }
+        public virtual void SetDropout(double newDropout) { dropout = newDropout; }
         public virtual void SetHiddenLayerSize(int newsize) { L1 = newsize;}
         public virtual void SetModelFile(string strModelFile) { m_strModelFile = strModelFile; }
@@ -226,7 +226,7 @@ public bool IsCRFModel()
         public double exp_10(double num) { return Math.Exp(num * 2.302585093); }
 
         public abstract void netReset(bool updateNet = false);
-        public abstract void computeNet(State state, double[] doutput);
+        public abstract void computeNet(State state, double[] doutput, bool isTrain = true);
 
 
         public virtual int[] PredictSentence(Sequence pSequence)
@@ -589,6 +589,7 @@ public void matrixXvectorADD(neuron[] dest, neuron[] srcvec, Matrix<double> srcm
                 //ac mod
                 Parallel.For(0, (to - from), parallelOption, i =>
                 {
+                    dest[i + from].cellOutput = 0;
                     for (int j = 0; j < to2 - from2; j++)
                     {
                         dest[i + from].cellOutput += srcvec[j + from2].cellOutput * srcmatrix[i][j];
                     }
@@ -600,6 +601,7 @@ public void matrixXvectorADD(neuron[] dest, neuron[] srcvec, Matrix<double> srcm
             {
                 Parallel.For(0, (to - from), parallelOption, i =>
                 {
+                    dest[i + from].er = 0;
                     for (int j = 0; j < to2 - from2; j++)
                     {
                         dest[i + from].er += srcvec[j + from2].er * srcmatrix[j][i];
                     }
@@ -801,7 +803,7 @@ public virtual Matrix<double> InnerDecode(Sequence pSequence)
             {
                 State state = pSequence.Get(curState);
                 setInputLayer(state, curState, numStates, predicted);
-                computeNet(state, m[curState]);      //compute probability distribution
+                computeNet(state, m[curState], false);      //compute probability distribution
 
                 predicted[curState] = GetBestOutputIndex();
             }
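Part of the advertised speed-up lives in the two matrixXvectorADD hunks above: the destination cell is now cleared inside the same parallel pass that accumulates the dot product, which lets callers (see the loops deleted from SimpleRNN.cs below) drop their separate zero-initialization passes. The pattern, reduced to its core as a sketch with assumed types:

    using System.Threading.Tasks;

    class MatVecSketch
    {
        // dest = matrix * src, with the accumulator reset folded into the
        // same Parallel.For that computes the dot products.
        static void MatrixXVector(double[] dest, double[] src, double[][] matrix)
        {
            Parallel.For(0, dest.Length, i =>
            {
                dest[i] = 0;    // reset here instead of in a separate loop
                for (int j = 0; j < src.Length; j++)
                {
                    dest[i] += src[j] * matrix[i][j];
                }
            });
        }
    }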
diff --git a/RNNSharp/RNNEncoder.cs b/RNNSharp/RNNEncoder.cs
index b8d1e4d..a8d8fbe 100644
--- a/RNNSharp/RNNEncoder.cs
+++ b/RNNSharp/RNNEncoder.cs
@@ -70,7 +70,7 @@ public void Train()
             rnn.SetCRFTraining(m_modelSetting.IsCRFTraining());
             rnn.SetLearningRate(m_modelSetting.GetLearningRate());
             rnn.SetGradientCutoff(15.0);
-            rnn.SetRegularization(m_modelSetting.GetRegularization());
+            rnn.SetDropout(m_modelSetting.GetDropout());
             rnn.SetHiddenLayerSize(m_modelSetting.GetNumHidden());
             rnn.SetTagBigramTransitionWeight(m_modelSetting.GetTagTransitionWeight());
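SimpleRNN.cs below carries the actual dropout logic: netReset samples a fresh mask for each sequence in training mode, computeHiddenActivity zeroes masked neurons (and applies the 1 - dropout scaling when decoding), and learnNet zeroes the error signal of masked neurons so they receive no weight updates. A condensed sketch of the train-time path; in the patch these three steps live in the separate methods just named:

    using System;

    class SimpleRnnDropoutSketch
    {
        struct Neuron { public double cellOutput; public double er; public bool mask; }

        static readonly Random rand = new Random();

        // Sample the mask at sequence reset, zero the activation in the
        // forward pass, and zero the error in the backward pass.
        static void ApplyDropout(Neuron[] hidden, double dropout)
        {
            for (int a = 0; a < hidden.Length; a++)
            {
                hidden[a].mask = rand.NextDouble() < dropout;   // netReset
                if (hidden[a].mask)
                {
                    hidden[a].cellOutput = 0;                   // computeHiddenActivity
                    hidden[a].er = 0;                           // learnNet
                }
            }
        }
    }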
diff --git a/RNNSharp/SimpleRNN.cs b/RNNSharp/SimpleRNN.cs
index d32faf9..aa8470c 100644
--- a/RNNSharp/SimpleRNN.cs
+++ b/RNNSharp/SimpleRNN.cs
@@ -12,7 +12,7 @@ public class SimpleRNN : RNN
         protected int bptt;
         protected int bptt_block;
         protected neuron[] bptt_hidden;
-        protected neuron[] bptt_fea;
+        protected double[] bptt_fea;
         protected SparseVector[] bptt_inputs = new SparseVector[MAX_RNN_HIST];    // TODO: add const constraint
 
         protected Matrix<double> mat_bptt_syn0_w = new Matrix<double>();
@@ -30,7 +30,7 @@ public SimpleRNN()
         {
             m_modeltype = MODELTYPE.SIMPLE;
             gradient_cutoff = 15;
-            beta = 0.0000001;
+            dropout = 0;
             llogp = -100000000;
             iter = 0;
 
@@ -99,10 +99,21 @@ public override void GetHiddenLayer(Matrix<double> m, int curStatus)
             }
         }
 
-        public void computeHiddenActivity()
+        public void computeHiddenActivity(bool isTrain)
         {
             for (int a = 0; a < L1; a++)
             {
+                if (neuHidden[a].mask == true)
+                {
+                    neuHidden[a].cellOutput = 0;
+                    continue;
+                }
+
+                if (isTrain == false)
+                {
+                    neuHidden[a].cellOutput = neuHidden[a].cellOutput * (1.0 - dropout);
+                }
+
                 if (neuHidden[a].cellOutput > 50) neuHidden[a].cellOutput = 50;  //for numerical stability
                 if (neuHidden[a].cellOutput < -50) neuHidden[a].cellOutput = -50;  //for numerical stability
                 neuHidden[a].cellOutput = 1.0 / (1.0 + Math.Exp(-neuHidden[a].cellOutput));
@@ -110,7 +121,7 @@ public void computeHiddenActivity()
         }
 
         // forward process. output layer consists of tag value
-        public override void computeNet(State state, double[] doutput)
+        public override void computeNet(State state, double[] doutput, bool isTrain = true)
         {
             //keep last hidden layer and erase activations
             neuLastHidden = new neuron[L1];
@@ -144,13 +155,7 @@ public override void computeNet(State state, double[] doutput)
             }
 
             //activate 1      --sigmoid
-            computeHiddenActivity();
-
-            //initialize output nodes
-            for (int c = 0; c < L2; c++)
-            {
-                neuOutput[c].cellOutput = 0;
-            }
+            computeHiddenActivity(isTrain);
 
             matrixXvectorADD(neuOutput, neuHidden, mat_hidden2output, 0, L2, 0, L1, 0);
             if (doutput != null)
@@ -174,11 +179,15 @@ public override void learnNet(State state, int timeat, bool biRNN = false)
                 CalculateOutputLayerError(state, timeat);
             }
 
+            matrixXvectorADD(neuHidden, neuOutput, mat_hidden2output, 0, L1, 0, L2, 1);    //error output->hidden for words from specific class
+
             for (int a = 0; a < L1; a++)
             {
-                neuHidden[a].er = 0;
+                if (neuHidden[a].mask == true)
+                {
+                    neuHidden[a].er = 0;
+                }
             }
-            matrixXvectorADD(neuHidden, neuOutput, mat_hidden2output, 0, L1, 0, L2, 1);    //error output->hidden for words from specific class
 
             for (int a = 0; a < L1; a++)
             {
@@ -209,7 +218,7 @@ void learnBptt(State state)
                 {
                     for (int a = 0; a < fea_size; a++)
                     {
-                        mat_bptt_synf[b][a] += neuHidden[b].er * bptt_fea[a + step * fea_size].cellOutput;
+                        mat_bptt_synf[b][a] += neuHidden[b].er * bptt_fea[a + step * fea_size];
                     }
                 });
             }
@@ -225,11 +234,6 @@ void learnBptt(State state)
                     }
                 });
 
-                for (int a = 0; a < L1; a++)
-                {
-                    neuLastHidden[a].er = 0;
-                }
-
                 matrixXvectorADD(neuLastHidden, neuHidden, mat_hiddenBpttWeight, 0, L1, 0, L1, 1);    //propagates errors hidden->input to the recurrent part
 
                 Parallel.For(0, L1, parallelOption, b =>
@@ -311,16 +315,7 @@ public void resetBpttMem()
             }
 
             bptt_hidden = new neuron[(bptt + bptt_block + 1) * L1];
-            for (int a = 0; a < (bptt + bptt_block) * L1; a++)
-            {
-                bptt_hidden[a].cellOutput = 0;
-                bptt_hidden[a].er = 0;
-            }
-
-            bptt_fea = new neuron[(bptt + bptt_block + 2) * fea_size];
-            for (int a = 0; a < (bptt + bptt_block) * fea_size; a++)
-                bptt_fea[a].cellOutput = 0;
-
+            bptt_fea = new double[(bptt + bptt_block + 2) * fea_size];
             mat_bptt_syn0_w = new Matrix<double>(L1, L0);
             mat_bptt_syn0_ph = new Matrix<double>(L1, L1);
             mat_bptt_synf = new Matrix<double>(L1, fea_size);
@@ -360,13 +355,28 @@ public override void initMem()
         public override void netReset(bool updateNet = false)   //cleans hidden layer activation + bptt history
         {
             for (int a = 0; a < L1; a++)
+            {
                 neuHidden[a].cellOutput = 0.1;
+                neuHidden[a].mask = false;
+            }
+
+            if (updateNet == true)
+            {
+                //Train mode
+                for (int a = 0; a < L1; a++)
+                {
+                    if (rand.NextDouble() < dropout)
+                    {
+                        neuHidden[a].mask = true;
+                    }
+                }
+            }
 
             if (bptt > 0)
             {
                 bptt_inputs = new SparseVector[MAX_RNN_HIST];
                 bptt_hidden = new neuron[(bptt + bptt_block + 1) * L1];
-                bptt_fea = new neuron[(bptt + bptt_block + 2) * fea_size];
+                bptt_fea = new double[(bptt + bptt_block + 2) * fea_size];
             }
         }
@@ -375,37 +385,28 @@ public override void LearnBackTime(State state, int numStates, int curState)
        {
             if (bptt > 0)
             {
-                //shift memory needed for bptt to next time step
-                for (int a = bptt + bptt_block - 1; a > 0; a--)
-                    bptt_inputs[a] = bptt_inputs[a - 1];
-                bptt_inputs[0] = state.GetSparseData();
-
-                for (int a = bptt + bptt_block - 1; a > 0; a--)
+                int maxBptt = 0;
+                for (maxBptt = 0; maxBptt < bptt + bptt_block - 1; maxBptt++)
                 {
-                    for (int b = 0; b < L1; b++)
+                    if (bptt_inputs[maxBptt] == null)
                     {
-                        bptt_hidden[a * L1 + b] = bptt_hidden[(a - 1) * L1 + b];
+                        break;
                     }
                 }
 
-                for (int a = bptt + bptt_block - 1; a > 0; a--)
+                //shift memory needed for bptt to next time step
+                for (int a = maxBptt; a > 0; a--)
                 {
-                    for (int b = 0; b < fea_size; b++)
-                    {
-                        bptt_fea[a * fea_size + b].cellOutput = bptt_fea[(a - 1) * fea_size + b].cellOutput;
-                    }
+                    bptt_inputs[a] = bptt_inputs[a - 1];
+                    Array.Copy(bptt_hidden, (a - 1) * L1, bptt_hidden, a * L1, L1);
+                    Array.Copy(bptt_fea, (a - 1) * fea_size, bptt_fea, a * fea_size, fea_size);
                 }
+
+                bptt_inputs[0] = state.GetSparseData();
             }
 
             //Save hidden and feature layer nodes values for bptt
-            for (int b = 0; b < L1; b++)
-            {
-                bptt_hidden[b] = neuHidden[b];
-            }
-            for (int b = 0; b < fea_size; b++)
-            {
-                bptt_fea[b].cellOutput = neuFeatures[b];
-            }
+            Array.Copy(neuHidden, 0, bptt_hidden, 0, L1);
+            Array.Copy(neuFeatures, 0, bptt_fea, 0, fea_size);
 
             // time to learn bptt
             if (((curState % bptt_block) == 0) || (curState == numStates - 1))
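The other half of the speed-up is in LearnBackTime above: bptt_fea becomes a flat double[] instead of neuron[], the history shift stops at the first empty slot (maxBptt) rather than always walking the full window, and the per-element copy loops are replaced with bulk Array.Copy calls. The shift pattern in isolation, under the assumption that each time step owns one contiguous block:

    using System;

    class BpttShiftSketch
    {
        // Shift per-step history blocks one slot toward the past.
        static void ShiftHistory(double[] history, int steps, int blockSize)
        {
            for (int a = steps - 1; a > 0; a--)
            {
                // One bulk copy per step instead of a loop over blockSize elements.
                Array.Copy(history, (a - 1) * blockSize, history, a * blockSize, blockSize);
            }
        }
    }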
diff --git a/RNNSharp/neuron.cs b/RNNSharp/neuron.cs
index 87d3416..6689ef1 100644
--- a/RNNSharp/neuron.cs
+++ b/RNNSharp/neuron.cs
@@ -8,7 +8,8 @@ namespace RNNSharp
 {
     public struct neuron
     {
-        public double cellOutput;      //actual value stored in neuron
-        public double er;              //error value in neuron, used by learning algorithm
+        public double cellOutput;      //actual value stored in neuron
+        public double er;              //error value in neuron, used by learning algorithm
+        public bool mask;
     }
 }
diff --git a/RNNSharpConsole/Program.cs b/RNNSharpConsole/Program.cs
index 60f685b..6bf4b0b 100644
--- a/RNNSharpConsole/Program.cs
+++ b/RNNSharpConsole/Program.cs
@@ -22,7 +22,7 @@ class Program
         static int iCRF = 0;
         static long savestep = 0;
        static double alpha = 0.1;
-        static double beta = 0.0000001;
+        static double dropout = 0;
         static int bptt = 4;
         static int modelType = 0;
         static int nBest = 1;
@@ -70,8 +70,8 @@ static void UsageTrain()
             Console.WriteLine(" -alpha <float>");
             Console.WriteLine("\tLearning rate, default is 0.1");
 
-            Console.WriteLine(" -beta <float>");
-            Console.WriteLine("\tRegularization parameter, default is 1e-7");
+            Console.WriteLine(" -dropout <float>");
+            Console.WriteLine("\tDropout parameter [0, 1.0), default is 0");
 
             Console.WriteLine(" -layersize <int>");
             Console.WriteLine("\tHidden layer size for training, default is 200");
@@ -136,7 +136,7 @@ static void InitParameters(string[] args)
             if ((i = ArgPos("-crf", args)) >= 0) iCRF = int.Parse(args[i + 1]);
             if ((i = ArgPos("-maxiter", args)) >= 0) maxIter = int.Parse(args[i + 1]);
             if ((i = ArgPos("-alpha", args)) >= 0) alpha = double.Parse(args[i + 1]);
-            if ((i = ArgPos("-beta", args)) >= 0) beta = double.Parse(args[i + 1]);
+            if ((i = ArgPos("-dropout", args)) >= 0) dropout = double.Parse(args[i + 1]);
             if ((i = ArgPos("-bptt", args)) >= 0) bptt = int.Parse(args[i + 1]);
             if ((i = ArgPos("-nbest", args)) >= 0) nBest = int.Parse(args[i + 1]);
             if ((i = ArgPos("-dir", args)) >= 0) iDir = int.Parse(args[i + 1]);
@@ -440,7 +440,7 @@ private static void Train()
             RNNConfig.SetMaxIteration(maxIter);
             RNNConfig.SetSaveStep(savestep);
             RNNConfig.SetLearningRate(alpha);
-            RNNConfig.SetRegularization(beta);
+            RNNConfig.SetDropout(dropout);
             RNNConfig.SetBptt(bptt);
 
             //Dump RNN setting on console
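A closing note on the -dropout parameter's [0, 1.0) range and the (1 - dropout) factor used at decode time: a unit kept with probability (1 - p) and zeroed otherwise has expected output (1 - p) * x, so scaling the always-on decode-time activation by (1 - p) matches the training-time expectation. A quick standalone Monte Carlo check of that identity (not part of the patch):

    using System;

    class DropoutExpectation
    {
        static void Main()
        {
            var rand = new Random(42);
            double p = 0.3, x = 1.0, sum = 0;
            int trials = 1000000;

            for (int n = 0; n < trials; n++)
            {
                sum += (rand.NextDouble() < p) ? 0 : x;   // train-time behavior
            }

            // Both values should come out near 0.7.
            Console.WriteLine("train-time mean: {0:F4}", sum / trials);
            Console.WriteLine("decode-time value: {0:F4}", (1.0 - p) * x);
        }
    }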