Commit

Merge branch 'terrytangyuan-master' into scala
yanqingmen committed Dec 28, 2015
2 parents 8449c89 + c75baf0 commit 933d3d4
Showing 10 changed files with 71 additions and 76 deletions.
31 changes: 5 additions & 26 deletions .gitignore
@@ -87,35 +87,14 @@ example/notebooks/.ipynb_checkpoints/*


# Scala package
# Jetbrain
scala-package/.idea

# ctags
scala-package/tags

scala-package/*.class
scala-package/*.log

# sbt specific
scala-package/.cache
scala-package/.lib/
scala-package/dist/*
scala-package/target/
scala-package/lib_managed/
scala-package/src_managed/
scala-package/project/boot/
scala-package/project/plugins/project/

#scala target folders
*.idea
*.class
scala-package/*/target/
scala-package/*/*/target/

# IDE specific
*.scala_dependencies
*.worksheet
*.idea
*.iml
#eclipse
.classpath
.project
.settings
*.classpath
*.project
*.settings
2 changes: 1 addition & 1 deletion Makefile
@@ -200,4 +200,4 @@ clean_all: clean
cd $(PS_PATH); make clean; cd -

-include build/*.d
-include build/*/*.d
-include build/*/*.d
10 changes: 6 additions & 4 deletions R-package/R/mxnet_generated.R
@@ -243,7 +243,7 @@ mx.io.CSVIter <- function(...) {
#' Batch Param: Batch size.
#' @param round.batch boolean, optional, default=True
#' Batch Param: Use round robin to handle overflow batch.
#' @param prefetch.buffer , optional, default=4
#' @param prefetch.buffer long (non-negative), optional, default=4
#' Backend Param: Number of prefetched parameters
#' @param rand.crop boolean, optional, default=False
#' Augmentation Param: Whether to random crop on the image
@@ -284,9 +284,11 @@ mx.io.CSVIter <- function(...) {
#' @param mean.r float, optional, default=0
#' Augmentation Param: Mean value on R channel.
#' @param mean.g float, optional, default=0
#' Augmentation: Mean value on G channel.
#' Augmentation Param: Mean value on G channel.
#' @param mean.b float, optional, default=0
#' Augmentation: Mean value on B channel.
#' Augmentation Param: Mean value on B channel.
#' @param mean.a float, optional, default=0
#' Augmentation Param: Mean value on Alpha channel.
#' @param scale float, optional, default=1
#' Augmentation Param: Scale in color space.
#' @param max.random.contrast float, optional, default=0
@@ -320,7 +322,7 @@ mx.io.ImageRecordIter <- function(...) {
#' partition the data into multiple parts
#' @param part.index int, optional, default='0'
#' the index of the part will read
#' @param prefetch.buffer , optional, default=4
#' @param prefetch.buffer long (non-negative), optional, default=4
#' Backend Param: Number of prefetched parameters
#' @return iter The result mx.dataiter
#'
9 changes: 6 additions & 3 deletions R-package/man/mx.io.ImageRecordIter.Rd

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion R-package/man/mx.io.MNISTIter.Rd

Some generated files are not rendered by default.

14 changes: 2 additions & 12 deletions R-package/src/Makevars.win
@@ -1,13 +1,3 @@
# _*_ mode: makefile; _*_
PKGROOT=../../

# This file is only used for compilation from github
# It will be replaced by more formal Rpackage structure
# Where PKGROOT moved to root directory

.PHONY: all mxnet
all: $(SHLIB)


PKG_CPPFLAGS = -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include
PKG_LIBS = -L../inst/libs/x64/ -llibmxnet
PKG_CPPFLAGS = -I../inst/include
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) -L../inst/libs/x64/ -llibmxnet
11 changes: 4 additions & 7 deletions example/kaggle-ndsb2/README.md
@@ -53,17 +53,14 @@ Prepare raw data in ```data``` folder. The tree of ```data``` folder is like

2. Run ```python3 Preprocessing.py``` to do preprocessing of data.
3. After we have the processed data, run ```python3 Train.py``` to generate ```submission.csv```

4. We also provide the R code with the same network structure and parameters in ```Train.R```. Right now it used the pre-processed csv files by ```Preprocessing.py```. We will add the pre-processing R code later.

Note:
- To run with python2, you need to change ```Train.py, line #139``` to the python2 syntax.
- To modify network, change ```get_lenet``` function in ```Train.py```
- To modify network, change ```get_lenet``` function in ```Train.py``` or ```get.lenet``` function in ```Train.R```.
- We also provide ```local_train```, ```local_test``` file for local parameter tuning.
- To run on multiple GPU with huge network, or questions about saving network paramter etc, please refer [MXNet docs](https://mxnet.readthedocs.org/en/latest/)
- To run on multiple GPU with huge network, or questions about saving network parameters etc, please refer [MXNet docs](https://mxnet.readthedocs.org/en/latest/)


## About MXNet
MXNet is a deep learning framework designed for both efficiency and flexibility by DMLC group. Like all other packages in DMLC, it will fully utilize all the resources to solve the problem under limited resource constraint, with a flexible programming interface. You can use it for all purposes of data science and deep learning tasks with R, Julia, python and more. see



MXNet is a deep learning framework designed for both efficiency and flexibility by DMLC group. Like all other packages in DMLC, it will fully utilize all the resources to solve the problem under limited resource constraint, with a flexible programming interface. You can use it for all purposes of data science and deep learning tasks with R, Julia, python and more.
20 changes: 18 additions & 2 deletions example/kaggle-ndsb2/Train.R
@@ -1,6 +1,12 @@
# Train.R for Second Annual Data Science Bowl
# Deep learning model with GPU support
# Please refer to https://mxnet.readthedocs.org/en/latest/build.html#r-package-installation
# for installation guide

require(mxnet)
require(data.table)

##A lenet style net, takes difference of each frame as input.
get.lenet <- function() {
source <- mx.symbol.Variable("data")
source <- (source-128) / 128
@@ -27,15 +33,19 @@ get.lenet <- function() {
mx.symbol.Pooling(
net, pool.type = "max", kernel = c(2, 2), stride = c(2, 2)
)
# first fullc
flatten <- mx.symbol.Flatten(net)
flatten <- mx.symbol.Dropout(flatten)
fc1 <- mx.symbol.FullyConnected(data = flatten, num.hidden = 600)
# Name the final layer as softmax so it auto matches the naming of data iterator
# Otherwise we can also change the provide_data in the data iter
return(mx.symbol.LogisticRegressionOutput(data = fc1, name = 'softmax'))
}

network <- get.lenet()
batch_size <- 32

# CSVIter is uesed here, since the data can't fit into memory
data_train <- mx.io.CSVIter(
data.csv = "./train-64x64-data.csv", data.shape = c(64, 64, 30),
label.csv = "./train-stytole.csv", label.shape = 600,
@@ -48,6 +58,7 @@ data_validate <- mx.io.CSVIter(
batch.size = 1
)

# Custom evaluation metric on CRPS.
mx.metric.CRPS <- mx.metric.custom("CRPS", function(label, pred) {
pred <- as.array(pred)
label <- as.array(label)
@@ -61,6 +72,7 @@ mx.metric.CRPS <- mx.metric.custom("CRPS", function(label, pred) {
return(sum((label - pred) ^ 2) / length(label))
})

# Training the stytole net
mx.set.seed(0)
stytole_model <- mx.model.FeedForward.create(
X = data_train,
@@ -73,8 +85,10 @@ stytole_model <- mx.model.FeedForward.create(
eval.metric = mx.metric.CRPS
)

# Predict stytole
stytole_prob = predict(stytole_model, data_validate)

# Training the diastole net
network = get.lenet()
batch_size = 32
data_train <-
@@ -95,6 +109,7 @@ diastole_model = mx.model.FeedForward.create(
eval.metric = mx.metric.CRPS
)

# Predict diastole
diastole_prob = predict(diastole_model, data_validate)

accumulate_result <- function(validate_lst, prob) {
@@ -109,11 +124,12 @@ diastole_result = as.data.frame(accumulate_result("./validate-label.csv", diasto

train_csv <- read.table("./train-label.csv", sep = ',')

# we have 2 person missing due to frame selection, use udibr's hist result instead
doHist <- function(data) {
res <- rep(0, 600)
for (i in 1:length(data)) {
for (j in round(data[i]):600) {
res[j] = res[j] + 1
res[j] = res[j] + 1
}
}
return(res / length(data))
@@ -147,7 +163,7 @@ for (i in 1:nrow(res)) {
res[i, 2:601] <- hDiastole
} else {
res[i, 2:601] <- hSystole
}
}
}
}

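The custom mx.metric.CRPS above targets the competition's Continuous Ranked Probability Score. Assuming the elided middle of the callback expands label and pred into 600-bin cumulative distributions over volume (a sketch; that part is not shown in this hunk), the score for N cases with true volumes V_m is

\[
\mathrm{CRPS} = \frac{1}{600\,N} \sum_{m=1}^{N} \sum_{n=0}^{599} \bigl(P_m(y \le n) - H(n - V_m)\bigr)^2,
\qquad
H(x) = \begin{cases} 1 & x \ge 0 \\ 0 & x < 0 \end{cases}
\]

and the final line, sum((label - pred) ^ 2) / length(label), computes exactly this average when each row of label encodes H(n - V_m).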
@@ -7,7 +7,7 @@ import org.slf4j.LoggerFactory
* @author Yuan Tang
*/

abstract class LRScheduler(protected var baseLR: Float = 0.01f) {
abstract class LRScheduler(var baseLR: Float = 0.01f) {
/**
* Base class of a learning rate scheduler
*
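The visibility change above, from protected var baseLR to a public var, is what allows an optimizer to overwrite the scheduler's base rate, as the Adam diff below now does with lrScheduler.baseLR = learningRate. A minimal sketch of a scheduler consuming the mutable field; the FactorScheduler name and the apply(numUpdate) abstract member are illustrative assumptions, not part of this commit:

import ml.dmlc.mxnet.LRScheduler

// Illustrative sketch only. Decays the externally settable baseLR by
// `factor` once every `step` updates; assumes the abstract member is
// `def apply(numUpdate: Int): Float`, which this diff does not show.
class FactorScheduler(step: Int, factor: Float) extends LRScheduler {
  def apply(numUpdate: Int): Float =
    baseLR * math.pow(factor, numUpdate / step).toFloat
}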
@@ -1,3 +1,5 @@
package ml.dmlc.mxnet.optimizer

import ml.dmlc.mxnet.{NDArray, Optimizer, LRScheduler}
import ml.dmlc.mxnet.NDArrayConversions._

@@ -8,7 +10,7 @@ import ml.dmlc.mxnet.NDArrayConversions._
* Adam: A Method for Stochastic Optimization,
* http://arxiv.org/abs/1412.6980
*
* @author Yuan Tang
* @author Yuan Tang, Yizhi Liu
*
* @param learningRate Float, Step size.
* @param beta1 Float, Exponential decay rate for the first moment estimates.
@@ -21,12 +23,17 @@ import ml.dmlc.mxnet.NDArrayConversions._
* @param lrScheduler The learning rate scheduler
*/
class Adam(var learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: Float = 0.999f,
val epsilon: Float = 0.00000001f, val decayFactor: Float = 1-0.00000001f, val wd: Float = 0.0f,
val epsilon: Float = 1e-8f, val decayFactor: Float = 1-1e-8f, val wd: Float = 0.0f,
rescaleGrad: Float = 1f, val clipGradient: Float = 0f,
val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad: Float) {
val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad: Float) {

protected var time: Int = 0
protected var timeFirstIndex: Int = 0
protected var timeFirstIndex: Option[Int] = None

if (lrScheduler != null) {
lrScheduler.baseLR = learningRate
}

/**
* Update the parameters.
* @param index An unique integer key used to index the parameters
@@ -45,41 +52,42 @@ class Adam(var learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2:
this.learningRate
}) * lrScale.getOrElse(index, 1f)

var (mean, variance) = state
val (mean, variance) = state.asInstanceOf[(NDArray, NDArray)]

if (timeFirstIndex == 0) {
timeFirstIndex = index
// increment time only when the first parameters is called
if (timeFirstIndex == None) {
timeFirstIndex = Option(index)
time = 0
} else if (timeFirstIndex == index) {
} else if (timeFirstIndex.get == index) {
time += 1
}

val t1: Int = time + 1
learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1))/(1.0 - math.pow(beta1, t1))) toFloat
val beta1t = beta1 * math.pow(decayFactor, t1 - 1) toFloat
learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1)) / (1.0 - math.pow(beta1, t1))).toFloat
val beta1t = beta1 * math.pow(decayFactor, t1 - 1).toFloat

var resdGrad = grad * rescaleGrad
if (clipGradient != 0f) {
resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient)
}

val meanT = beta1t * mean.asInstanceOf[NDArray] + (1.0 - beta1t) * resdGrad toScalar
val varianceT = beta2 * variance.asInstanceOf[NDArray] + (1.0f - beta2) * resdGrad * resdGrad toScalar
val meanT = beta1t * mean + (1.0 - beta1t) * resdGrad
val varianceT = beta2 * variance + (1.0f - beta2) * resdGrad * resdGrad

var step = learningRate * meanT / (math.sqrt(varianceT) + epsilon)
var step = learningRate * meanT / (NDArray.sqrt(varianceT) + epsilon)

if (wd > 0.0f) {
step += (lr * wd * weight).toScalar
step += lr * wd * weight
}

weight += -step.toFloat
mean = meanT
variance = varianceT
weight += -step
mean.set(meanT)
variance.set(varianceT)
}

// Create additional optimizer state: mean, variance
override def createState(index: Int, weight: NDArray): AnyRef = {
timeFirstIndex = 0
override def createState(index: Int, weight: NDArray): (NDArray, NDArray) = {
timeFirstIndex = None // time is incremented only on the first index
(NDArray.zeros(weight.shape, weight.context), // mean
NDArray.zeros(weight.shape, weight.context)) // variance
}
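For reference, the update the corrected code performs on each parameter: with t = time + 1 the shared time step, d the decayFactor, g the rescaled and clipped gradient, and lr the scheduled, scaled learning rate, it is

\[
\begin{aligned}
\beta_{1,t} &= \beta_1\, d^{\,t-1}, \qquad
m_t = \beta_{1,t}\, m_{t-1} + (1 - \beta_{1,t})\, g_t, \qquad
v_t = \beta_2\, v_{t-1} + (1 - \beta_2)\, g_t^{2}, \\
\eta_t &= lr \cdot \frac{\sqrt{1 - \beta_2^{\,t}}}{1 - \beta_1^{\,t}}, \qquad
w \leftarrow w - \Bigl(\eta_t\, \frac{m_t}{\sqrt{v_t} + \epsilon} + lr \cdot wd \cdot w\Bigr),
\end{aligned}
\]

where the wd term is added only when wd > 0, and m and v persist across calls through the (mean, variance) state that is now updated in place via NDArray.set.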
