Skip to content

Commit

Permalink
zhongkaifu#1. Support dropout for SimpleRNN (both forward and bi-directional)
Browse files Browse the repository at this point in the history

zhongkaifu#2. Speed up training performance
  • Loading branch information
zhongkaifu committed Dec 3, 2015
1 parent a4dcfcf commit debffdb
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 74 deletions.
Binary file modified RNNSharp.v12.suo
Binary file not shown.
10 changes: 5 additions & 5 deletions RNNSharp/BiRNN.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,12 @@ public override void SetGradientCutoff(double newGradient)
backwardRNN.SetGradientCutoff(newGradient);
}

public override void SetRegularization(double newBeta)
public override void SetDropout(double newDropout)
{
beta = newBeta;
dropout = newDropout;

forwardRNN.SetRegularization(newBeta);
backwardRNN.SetRegularization(newBeta);
forwardRNN.SetDropout(newDropout);
backwardRNN.SetDropout(newDropout);
}

public override void SetHiddenLayerSize(int newsize)
Expand Down Expand Up @@ -453,7 +453,7 @@ public override void learnNet(State state, int timeat, bool biRNN = false)

}

public override void computeNet(State state, double[] doutput)
public override void computeNet(State state, double[] doutput, bool isTrain = true)
{

}
Expand Down
2 changes: 1 addition & 1 deletion RNNSharp/LSTMRNN.cs
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ public override void learnNet(State state, int timeat, bool biRNN = false)


// forward process. output layer consists of tag value
public override void computeNet(State state, double[] doutput)
public override void computeNet(State state, double[] doutput, bool isTrain = true)
{
//inputs(t) -> hidden(t)
//Get sparse feature and apply it into hidden layer
Expand Down
8 changes: 4 additions & 4 deletions RNNSharp/ModelSetting.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ public class ModelSetting
public double GetLearningRate(){ return m_LearningRate; }
public void SetLearningRate(double r) { m_LearningRate = r; }

public double GetRegularization() { return m_Regularization; }
public void SetRegularization(double r) { m_Regularization = r; }
public double GetDropout() { return m_Dropout; }
public void SetDropout(double r) { m_Dropout = r; }

public double GetTagTransitionWeight(){ return m_tagTransitionWeight; }
public void SetTagTransitionWeight(double r) { m_tagTransitionWeight = r; }
Expand Down Expand Up @@ -60,7 +60,7 @@ public long GetSaveStep()
int m_NumHidden;
double m_LearningRate;
double m_tagTransitionWeight;
double m_Regularization;
double m_Dropout;
int m_Bptt;
int m_MaxIteration;
bool m_bCRFTraining;
Expand Down Expand Up @@ -91,7 +91,7 @@ public void DumpSetting()
}

Console.WriteLine("Learning rate: {0}", m_LearningRate);
Console.WriteLine("Regularization: {0}", m_Regularization);
Console.WriteLine("Dropout: {0}", m_Dropout);
Console.WriteLine("Max Iteration: {0}", m_MaxIteration);
Console.WriteLine("Hidden layer size: {0}", m_NumHidden);
Console.WriteLine("RNN-CRF: {0}", m_bCRFTraining);
Expand Down
12 changes: 7 additions & 5 deletions RNNSharp/RNN.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ abstract public class RNN
protected double minTknErrRatio;
protected double lastTknErrRatio;
protected long counter;
protected double beta;
protected double dropout;
protected ParallelOptions parallelOption = new ParallelOptions();
protected double gradient_cutoff;
protected bool m_bCRFTraining = false;
Expand Down Expand Up @@ -175,7 +175,7 @@ public RNN()
gradient_cutoff = 15;

alpha = 0.1;
beta = 0.0000001;
dropout = 0;
logp = 0;
llogp = -100000000;
minTknErrRatio = 1000000;
Expand Down Expand Up @@ -214,7 +214,7 @@ public bool ShouldTrainingStop()
public virtual void SetValidationSet(DataSet validation) { m_ValidationSet = validation; }
public virtual void SetGradientCutoff(double newGradient) { gradient_cutoff = newGradient; }
public virtual void SetLearningRate(double newAlpha) { alpha = newAlpha; }
public virtual void SetRegularization(double newBeta) { beta = newBeta; }
public virtual void SetDropout(double newDropout) { dropout = newDropout; }
public virtual void SetHiddenLayerSize(int newsize) { L1 = newsize;}
public virtual void SetModelFile(string strModelFile) { m_strModelFile = strModelFile; }

Expand All @@ -226,7 +226,7 @@ public bool IsCRFModel()
public double exp_10(double num) { return Math.Exp(num * 2.302585093); }

public abstract void netReset(bool updateNet = false);
public abstract void computeNet(State state, double[] doutput);
public abstract void computeNet(State state, double[] doutput, bool isTrain = true);


public virtual int[] PredictSentence(Sequence pSequence)
Expand Down Expand Up @@ -589,6 +589,7 @@ public void matrixXvectorADD(neuron[] dest, neuron[] srcvec, Matrix<double> srcm
//ac mod
Parallel.For(0, (to - from), parallelOption, i =>
{
dest[i + from].cellOutput = 0;
for (int j = 0; j < to2 - from2; j++)
{
dest[i + from].cellOutput += srcvec[j + from2].cellOutput * srcmatrix[i][j];
Expand All @@ -600,6 +601,7 @@ public void matrixXvectorADD(neuron[] dest, neuron[] srcvec, Matrix<double> srcm
{
Parallel.For(0, (to - from), parallelOption, i =>
{
dest[i + from].er = 0;
for (int j = 0; j < to2 - from2; j++)
{
dest[i + from].er += srcvec[j + from2].er * srcmatrix[j][i];
Expand Down Expand Up @@ -801,7 +803,7 @@ public virtual Matrix<double> InnerDecode(Sequence pSequence)
{
State state = pSequence.Get(curState);
setInputLayer(state, curState, numStates, predicted);
computeNet(state, m[curState]); //compute probability distribution
computeNet(state, m[curState], false); //compute probability distribution

predicted[curState] = GetBestOutputIndex();
}
Expand Down
2 changes: 1 addition & 1 deletion RNNSharp/RNNEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public void Train()
rnn.SetCRFTraining(m_modelSetting.IsCRFTraining());
rnn.SetLearningRate(m_modelSetting.GetLearningRate());
rnn.SetGradientCutoff(15.0);
rnn.SetRegularization(m_modelSetting.GetRegularization());
rnn.SetDropout(m_modelSetting.GetDropout());
rnn.SetHiddenLayerSize(m_modelSetting.GetNumHidden());
rnn.SetTagBigramTransitionWeight(m_modelSetting.GetTagTransitionWeight());

Expand Down
103 changes: 52 additions & 51 deletions RNNSharp/SimpleRNN.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public class SimpleRNN : RNN
protected int bptt;
protected int bptt_block;
protected neuron[] bptt_hidden;
protected neuron[] bptt_fea;
protected double[] bptt_fea;
protected SparseVector[] bptt_inputs = new SparseVector[MAX_RNN_HIST]; // TODO: add const constraint

protected Matrix<double> mat_bptt_syn0_w = new Matrix<double>();
Expand All @@ -30,7 +30,7 @@ public SimpleRNN()
{
m_modeltype = MODELTYPE.SIMPLE;
gradient_cutoff = 15;
beta = 0.0000001;
dropout = 0;
llogp = -100000000;
iter = 0;

Expand Down Expand Up @@ -99,18 +99,29 @@ public override void GetHiddenLayer(Matrix<double> m, int curStatus)
}
}

public void computeHiddenActivity()
public void computeHiddenActivity(bool isTrain)
{
for (int a = 0; a < L1; a++)
{
if (neuHidden[a].mask == true)
{
neuHidden[a].cellOutput = 0;
continue;
}

if (isTrain == false)
{
neuHidden[a].cellOutput = neuHidden[a].cellOutput * (1.0 - dropout);
}

if (neuHidden[a].cellOutput > 50) neuHidden[a].cellOutput = 50; //for numerical stability
if (neuHidden[a].cellOutput < -50) neuHidden[a].cellOutput = -50; //for numerical stability
neuHidden[a].cellOutput = 1.0 / (1.0 + Math.Exp(-neuHidden[a].cellOutput));
}
}

// forward process. output layer consists of tag value
public override void computeNet(State state, double[] doutput)
public override void computeNet(State state, double[] doutput, bool isTrain = true)
{
//keep last hidden layer and erase activations
neuLastHidden = new neuron[L1];
Expand Down Expand Up @@ -144,13 +155,7 @@ public override void computeNet(State state, double[] doutput)
}

//activate 1 --sigmoid
computeHiddenActivity();

//initialize output nodes
for (int c = 0; c < L2; c++)
{
neuOutput[c].cellOutput = 0;
}
computeHiddenActivity(isTrain);

matrixXvectorADD(neuOutput, neuHidden, mat_hidden2output, 0, L2, 0, L1, 0);
if (doutput != null)
Expand All @@ -174,11 +179,15 @@ public override void learnNet(State state, int timeat, bool biRNN = false)
CalculateOutputLayerError(state, timeat);
}

matrixXvectorADD(neuHidden, neuOutput, mat_hidden2output, 0, L1, 0, L2, 1); //error output->hidden for words from specific class

for (int a = 0; a < L1; a++)
{
neuHidden[a].er = 0;
if (neuHidden[a].mask == true)
{
neuHidden[a].er = 0;
}
}
matrixXvectorADD(neuHidden, neuOutput, mat_hidden2output, 0, L1, 0, L2, 1); //error output->hidden for words from specific class

for (int a = 0; a < L1; a++)
{
Expand Down Expand Up @@ -209,7 +218,7 @@ void learnBptt(State state)
{
for (int a = 0; a < fea_size; a++)
{
mat_bptt_synf[b][a] += neuHidden[b].er * bptt_fea[a + step * fea_size].cellOutput;
mat_bptt_synf[b][a] += neuHidden[b].er * bptt_fea[a + step * fea_size];
}
});
}
Expand All @@ -225,11 +234,6 @@ void learnBptt(State state)
}
});

for (int a = 0; a < L1; a++)
{
neuLastHidden[a].er = 0;
}

matrixXvectorADD(neuLastHidden, neuHidden, mat_hiddenBpttWeight, 0, L1, 0, L1, 1); //propagates errors hidden->input to the recurrent part

Parallel.For(0, L1, parallelOption, b =>
Expand Down Expand Up @@ -311,16 +315,7 @@ public void resetBpttMem()
}

bptt_hidden = new neuron[(bptt + bptt_block + 1) * L1];
for (int a = 0; a < (bptt + bptt_block) * L1; a++)
{
bptt_hidden[a].cellOutput = 0;
bptt_hidden[a].er = 0;
}

bptt_fea = new neuron[(bptt + bptt_block + 2) * fea_size];
for (int a = 0; a < (bptt + bptt_block) * fea_size; a++)
bptt_fea[a].cellOutput = 0;

bptt_fea = new double[(bptt + bptt_block + 2) * fea_size];
mat_bptt_syn0_w = new Matrix<double>(L1, L0);
mat_bptt_syn0_ph = new Matrix<double>(L1, L1);
mat_bptt_synf = new Matrix<double>(L1, fea_size);
Expand Down Expand Up @@ -360,13 +355,28 @@ public override void initMem()
public override void netReset(bool updateNet = false) //cleans hidden layer activation + bptt history
{
for (int a = 0; a < L1; a++)
{
neuHidden[a].cellOutput = 0.1;
neuHidden[a].mask = false;
}

if (updateNet == true)
{
//Train mode
for (int a = 0; a < L1; a++)
{
if (rand.NextDouble() < dropout)
{
neuHidden[a].mask = true;
}
}
}

if (bptt > 0)
{
bptt_inputs = new SparseVector[MAX_RNN_HIST];
bptt_hidden = new neuron[(bptt + bptt_block + 1) * L1];
bptt_fea = new neuron[(bptt + bptt_block + 2) * fea_size];
bptt_fea = new double[(bptt + bptt_block + 2) * fea_size];
}
}

Expand All @@ -375,37 +385,28 @@ public override void LearnBackTime(State state, int numStates, int curState)
{
if (bptt > 0)
{
//shift memory needed for bptt to next time step
for (int a = bptt + bptt_block - 1; a > 0; a--)
bptt_inputs[a] = bptt_inputs[a - 1];
bptt_inputs[0] = state.GetSparseData();

for (int a = bptt + bptt_block - 1; a > 0; a--)
int maxBptt = 0;
for (maxBptt = 0; maxBptt < bptt + bptt_block - 1; maxBptt++)
{
for (int b = 0; b < L1; b++)
if (bptt_inputs[maxBptt] == null)
{
bptt_hidden[a * L1 + b] = bptt_hidden[(a - 1) * L1 + b];
break;
}
}

for (int a = bptt + bptt_block - 1; a > 0; a--)
//shift memory needed for bptt to next time step
for (int a = maxBptt; a > 0; a--)
{
for (int b = 0; b < fea_size; b++)
{
bptt_fea[a * fea_size + b].cellOutput = bptt_fea[(a - 1) * fea_size + b].cellOutput;
}
bptt_inputs[a] = bptt_inputs[a - 1];
Array.Copy(bptt_hidden, (a - 1) * L1, bptt_hidden, a * L1, L1);
Array.Copy(bptt_fea, (a - 1) * fea_size, bptt_fea, a * fea_size, fea_size);
}
bptt_inputs[0] = state.GetSparseData();
}

//Save hidden and feature layer nodes values for bptt
for (int b = 0; b < L1; b++)
{
bptt_hidden[b] = neuHidden[b];
}
for (int b = 0; b < fea_size; b++)
{
bptt_fea[b].cellOutput = neuFeatures[b];
}
Array.Copy(neuHidden, 0, bptt_hidden, 0, L1);
Array.Copy(neuFeatures, 0, bptt_fea, 0, fea_size);

// time to learn bptt
if (((curState % bptt_block) == 0) || (curState == numStates - 1))
Expand Down
5 changes: 3 additions & 2 deletions RNNSharp/neuron.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ namespace RNNSharp
{
public struct neuron
{
public double cellOutput; //actual value stored in neuron
public double er; //error value in neuron, used by learning algorithm
public double cellOutput; //actual value stored in neuron
public double er; //error value in neuron, used by learning algorithm
public bool mask;
}
}
10 changes: 5 additions & 5 deletions RNNSharpConsole/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Program
static int iCRF = 0;
static long savestep = 0;
static double alpha = 0.1;
static double beta = 0.0000001;
static double dropout = 0;
static int bptt = 4;
static int modelType = 0;
static int nBest = 1;
Expand Down Expand Up @@ -70,8 +70,8 @@ static void UsageTrain()
Console.WriteLine(" -alpha <float>");
Console.WriteLine("\tLearning rate, default is 0.1");

Console.WriteLine(" -beta <float>");
Console.WriteLine("\tRegularization parameter, default is 1e-7");
Console.WriteLine(" -dropout <float>");
Console.WriteLine("\tDropout parameter [0, 1.0), default is 0");

Console.WriteLine(" -layersize <int>");
Console.WriteLine("\tHidden layer size for training, default is 200");
Expand Down Expand Up @@ -136,7 +136,7 @@ static void InitParameters(string[] args)
if ((i = ArgPos("-crf", args)) >= 0) iCRF = int.Parse(args[i + 1]);
if ((i = ArgPos("-maxiter", args)) >= 0) maxIter = int.Parse(args[i + 1]);
if ((i = ArgPos("-alpha", args)) >= 0) alpha = double.Parse(args[i + 1]);
if ((i = ArgPos("-beta", args)) >= 0) beta = double.Parse(args[i + 1]);
if ((i = ArgPos("-dropout", args)) >= 0) dropout = double.Parse(args[i + 1]);
if ((i = ArgPos("-bptt", args)) >= 0) bptt = int.Parse(args[i + 1]);
if ((i = ArgPos("-nbest", args)) >= 0) nBest = int.Parse(args[i + 1]);
if ((i = ArgPos("-dir", args)) >= 0) iDir = int.Parse(args[i + 1]);
Expand Down Expand Up @@ -440,7 +440,7 @@ private static void Train()
RNNConfig.SetMaxIteration(maxIter);
RNNConfig.SetSaveStep(savestep);
RNNConfig.SetLearningRate(alpha);
RNNConfig.SetRegularization(beta);
RNNConfig.SetDropout(dropout);
RNNConfig.SetBptt(bptt);

//Dump RNN setting on console
Expand Down

0 comments on commit debffdb

Please sign in to comment.