reminisce
diff --git a/‎docs/how_to/caffe.md
+43 b/‎docs/how_to/caffe.md
+43
diff --git a/‎example/caffe/caffe_net.py
+103 b/‎example/caffe/caffe_net.py
+103
diff --git a/‎example/caffe/data.py
+37 b/‎example/caffe/data.py
+37
diff --git a/‎example/caffe/train_model.py
+100 b/‎example/caffe/train_model.py
+100
diff --git a/‎make/config.mk
+4 b/‎make/config.mk
+4
diff --git a/‎plugin/caffe/caffe.mk
+15 b/‎plugin/caffe/caffe.mk
+15
diff --git a/‎plugin/caffe/caffe_blob.cc
+77 b/‎plugin/caffe/caffe_blob.cc
+77
@@ -0,0 +1,43 @@
+# How to use Caffe Op(Layer) in MXNet
+
+This tutorial demonstrates how to call Caffe operator in MXNet:
+
+* 1) Compile MXNet with Caffe support.
+
+* 2) Embed Caffe's neural network layers into MXNet's symbolic graph.
+
+## Install Caffe With MXNet interface
+* Download the official Caffe repository [BVLC/Caffe](https://github.com/BVLC/caffe).
+* Download the mxnet-interface [patch](https://github.com/BVLC/caffe/pull/4527.patch). Move the patch file under your caffe folder and apply it with `git apply 4527.patch`.
+* Install caffe following [official guide](http://caffe.berkeleyvision.org/installation.html).
+
+## Compile with Caffe
+* In mxnet folder, open `config.mk` (if you haven't already, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into MXNet root folder as `config.mk`) and uncomment the lines `CAFFE_PATH = $(HOME)/caffe` and `MXNET_PLUGINS += plugin/caffe/caffe.mk`. Modify `CAFFE_PATH` to your caffe installation if necessary. 
+* Run `make clean && make` to build with caffe support.
+
+## Caffe Operators(Layers)
+Caffe's neural network layers are supported by MXNet through the `mxnet.symbol.CaffeOp` symbol.
+For example, the following code shows a multi-layer perceptron network for classifying MNIST digits (the [full code](https://github.com/HrWangChengdu/mxnet/blob/master/example/caffe/caffe_net.py) also includes lenet):
+```Python
+data = mx.symbol.Variable('data')
+fc1  = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")
+act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
+fc2  = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }")
+act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}")
+fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}")
+mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+```
+Let's break it down. First, `data = mx.symbol.Variable('data')` defines a Variable as a placeholder for the input.
+Then it's fed through Caffe's operators with `fc1  = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")`.
+
+The inputs to a Caffe layer are named `data_i` for i = 0 ... num_data-1, where `num_data` is the number of inputs. You may skip the `num_data` argument, as the example does, if its value is 1. `num_weight` is the number of `blobs_` (weights) in the Caffe layer. Its default value is 0, as most layers, e.g. TanH, own no weights. `prototxt` is the Caffe layer configuration string.
+
+We could also replace the last line by:
+```Python
+label = mx.symbol.Variable('softmax_label')
+mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
+```
+to use a loss function from Caffe.
+
+## Use your own customized layers
+Running a new Caffe layer from MXNet is no different from using regular Caffe layers, following the rules above. There's no need to add any code to MXNet.
@@ -0,0 +1,103 @@
+import os, sys
+import mxnet as mx
+from data import get_iterator 
+import argparse
+import train_model
+
+def get_mlp():
+    """
+    multi-layer perceptron
+    """
+    data = mx.symbol.Variable('data')
+    fc1  = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")
+    act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
+    fc2  = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }")
+    act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}")
+    fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}")
+    if use_caffe_loss:
+        label = mx.symbol.Variable('softmax_label')
+        mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
+    else:
+        mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+    return mlp
+
+def get_lenet():
+    """
+    LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
+    Haffner. "Gradient-based learning applied to document recognition."
+    Proceedings of the IEEE (1998)
+    """
+    data = mx.symbol.Variable('data')
+
+    # first conv
+    conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 20 kernel_size: 5 stride: 1} }")
+    act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}")
+    pool1 = mx.symbol.CaffeOp(data_0=act1, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}")
+
+    # second conv
+    conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 50 kernel_size: 5 stride: 1} }")
+    act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}")
+    pool2 = mx.symbol.CaffeOp(data_0=act2, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}")
+
+    fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }")
+    act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
+
+    # second fullc
+    fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }")
+    if use_caffe_loss:
+        label = mx.symbol.Variable('softmax_label')
+        lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
+    else:
+        lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax')
+    return lenet
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='train an image classifer on mnist')
+    parser.add_argument('--network', type=str, default='lenet',
+                        choices = ['mlp', 'lenet'],
+                        help='the cnn to use')
+    parser.add_argument('--caffe-loss', type=int, default=0,
+                        help='Use CaffeLoss symbol')
+    parser.add_argument('--data-dir', type=str, default='mnist/',
+                        help='the input data directory')
+    parser.add_argument('--gpus', type=str,
+                        help='the gpus will be used, e.g "0,1,2,3"')
+    parser.add_argument('--num-examples', type=int, default=60000,
+                        help='the number of training examples')
+    parser.add_argument('--batch-size', type=int, default=128,
+                        help='the batch size')
+    parser.add_argument('--lr', type=float, default=.1,
+                        help='the initial learning rate')
+    parser.add_argument('--model-prefix', type=str,
+                        help='the prefix of the model to load/save')
+    parser.add_argument('--save-model-prefix', type=str,
+                        help='the prefix of the model to save')
+    parser.add_argument('--num-epochs', type=int, default=10,
+                        help='the number of training epochs')
+    parser.add_argument('--load-epoch', type=int,
+                        help="load the model on an epoch using the model-prefix")
+    parser.add_argument('--kv-store', type=str, default='local',
+                        help='the kvstore type')
+    parser.add_argument('--lr-factor', type=float, default=1,
+                        help='times the lr with a factor for every lr-factor-epoch epoch')
+    parser.add_argument('--lr-factor-epoch', type=float, default=1,
+                        help='the number of epoch to factor the lr, could be .5')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    use_caffe_loss = args.caffe_loss
+
+    if args.network == 'mlp':
+        data_shape = (784, )
+        net = get_mlp()
+    else:
+        data_shape = (1, 28, 28)
+        net = get_lenet()
+
+    # train
+    if use_caffe_loss:
+        train_model.fit(args, net, get_iterator(data_shape), mx.metric.Caffe())
+    else:
+        train_model.fit(args, net, get_iterator(data_shape))
@@ -0,0 +1,37 @@
+import sys
+import os
+# code to automatically download dataset
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.append(os.path.join(curr_path, "../../tests/python/common"))
+import get_data
+import mxnet as mx
+
+def get_iterator(data_shape):
+    def get_iterator_impl(args, kv):
+        """return train and val iterators for mnist"""
+        # download data
+        get_data.GetMNIST_ubyte()
+        flat = False if len(data_shape) == 3 else True
+
+        train           = mx.io.MNISTIter(
+            image       = "data/train-images-idx3-ubyte",
+            label       = "data/train-labels-idx1-ubyte",
+            input_shape = data_shape,
+            batch_size  = args.batch_size,
+            shuffle     = True,
+            flat        = flat,
+            num_parts   = kv.num_workers,
+            part_index  = kv.rank)
+
+        val = mx.io.MNISTIter(
+            image       = "data/t10k-images-idx3-ubyte",
+            label       = "data/t10k-labels-idx1-ubyte",
+            input_shape = data_shape,
+            batch_size  = args.batch_size,
+            flat        = flat,
+            num_parts   = kv.num_workers,
+            part_index  = kv.rank)
+
+        return (train, val)
+    return get_iterator_impl
+
@@ -0,0 +1,100 @@
+import mxnet as mx
+import logging
+import os
+
+def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None):
+    # kvstore
+    kv = mx.kvstore.create(args.kv_store)
+
+    # logging
+    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
+    if 'log_file' in args and args.log_file is not None:
+        log_file = args.log_file
+        log_dir = args.log_dir
+        log_file_full_name = os.path.join(log_dir, log_file)
+        if not os.path.exists(log_dir):
+            os.mkdir(log_dir)
+        logger = logging.getLogger()
+        handler = logging.FileHandler(log_file_full_name)
+        formatter = logging.Formatter(head)
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        logger.setLevel(logging.DEBUG)
+        logger.info('start with arguments %s', args)
+    else:
+        logging.basicConfig(level=logging.DEBUG, format=head)
+        logging.info('start with arguments %s', args)
+
+    # load model
+    model_prefix = args.model_prefix
+    if model_prefix is not None:
+        model_prefix += "-%d" % (kv.rank)
+    model_args = {}
+    if args.load_epoch is not None:
+        assert model_prefix is not None
+        tmp = mx.model.FeedForward.load(model_prefix, args.load_epoch)
+        model_args = {'arg_params' : tmp.arg_params,
+                      'aux_params' : tmp.aux_params,
+                      'begin_epoch' : args.load_epoch}
+    # save model
+    save_model_prefix = args.save_model_prefix
+    if save_model_prefix is None:
+        save_model_prefix = model_prefix
+    checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix)
+
+    # data
+    (train, val) = data_loader(args, kv)
+
+    # train
+    devs = mx.cpu() if args.gpus is None else [
+        mx.gpu(int(i)) for i in args.gpus.split(',')]
+
+    epoch_size = args.num_examples / args.batch_size
+
+    if args.kv_store == 'dist_sync':
+        epoch_size /= kv.num_workers
+        model_args['epoch_size'] = epoch_size
+
+    if 'lr_factor' in args and args.lr_factor < 1:
+        model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
+            step = max(int(epoch_size * args.lr_factor_epoch), 1),
+            factor = args.lr_factor)
+
+    if 'clip_gradient' in args and args.clip_gradient is not None:
+        model_args['clip_gradient'] = args.clip_gradient
+
+    # disable kvstore for single device
+    if 'local' in kv.type and (
+            args.gpus is None or len(args.gpus.split(',')) is 1):
+        kv = None
+
+    model = mx.model.FeedForward(
+        ctx                = devs,
+        symbol             = network,
+        num_epoch          = args.num_epochs,
+        learning_rate      = args.lr,
+        momentum           = 0.9,
+        wd                 = 0.00001,
+        initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34),
+        **model_args)
+
+    if eval_metrics == None:
+        eval_metrics = ['accuracy']
+        ## TopKAccuracy only allows top_k > 1
+        for top_k in [5, 10, 20]:
+            eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = top_k))
+
+    if batch_end_callback is not None:
+        if not isinstance(batch_end_callback, list):
+            batch_end_callback = [batch_end_callback]
+    else:
+        batch_end_callback = []
+    batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50))
+
+    model.fit(
+       X                  = train,
+       eval_data          = val,
+       eval_metric        = eval_metrics,
+       kvstore            = kv,
+       batch_end_callback = batch_end_callback,
+       epoch_end_callback = checkpoint)
@@ -111,6 +111,10 @@ EXTRA_OPERATORS =
 # plugins
 #----------------------------
 
+# whether to use caffe integration. This requires including caffe submodule.
+# CAFFE_PATH = caffe-lite 
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
 # whether to use torch integration. This requires installing torch.
 # You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
 # TORCH_PATH = $(HOME)/torch
 
@@ -0,0 +1,15 @@
+# Build settings for the Caffe plugin. CAFFE_PATH is not set here; config.mk
+# carries a commented-out assignment for it next to the line that enables
+# this file.
+
+# Put Caffe's include directory and its build/src directory on the include path.
+CFLAGS += -I$(CAFFE_PATH)/include -I$(CAFFE_PATH)/build/src
+# Link against libcaffe from $(CAFFE_PATH)/build/lib plus the protobuf, boost,
+# gflags and glog libraries listed below.
+LDFLAGS += -lprotobuf -lboost_system -lboost_thread -lboost_filesystem -lgflags -lglog -L$(CAFFE_PATH)/build/lib -lcaffe
+
+# Define USE_CUDNN for the compiler when the build has USE_CUDNN set to 1.
+ifeq ($(USE_CUDNN), 1)
+	CFLAGS += -DUSE_CUDNN=1
+endif
+
+# Define CPU_ONLY when the build has USE_CUDA set to 0
+# (presumably read by the Caffe headers -- TODO confirm).
+ifeq ($(USE_CUDA), 0)
+	CFLAGS += -DCPU_ONLY=1
+endif
+
+# Each plugin/caffe/<name>.cc maps to build/plugin/caffe/<name>.o and each
+# plugin/caffe/<name>.cu to build/plugin/caffe/<name>_gpu.o; both lists are
+# appended to the plugin object variables.
+CAFFE_SRC = $(wildcard plugin/caffe/*.cc)
+PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(CAFFE_SRC))
+CAFFE_CUSRC = $(wildcard plugin/caffe/*.cu)
+PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(CAFFE_CUSRC))
@@ -0,0 +1,77 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file caffe_blob.cc
+ * \brief Implementations of SetDataGradToBlob given various device/dimension
+ * \author Haoran Wang 
+*/
+#include "caffe_blob.h"
+namespace mxnet {
+namespace op {
+namespace caffe {
+
+/*!
+ * \brief CPU/float specialization: hand the TBlob's buffer to a caffe blob.
+ * \param memType Data selects the blob's data pointer; any other value
+ *        selects its diff pointer
+ * \param blob iterator to the caffe blob to update
+ * \param itr iterator to the TBlob whose dptr_ is passed on
+ */
+template<>
+void SetDataGradToBlob<mshadow::cpu, float>(caffeMemoryTypes memType,
+                            std::vector<::caffe::Blob<float>*>::iterator blob,
+                            std::vector<mshadow::TBlob>::const_iterator itr) {
+  ::caffe::Blob<float> *target = *blob;
+  float *raw = reinterpret_cast<float*>(itr->dptr_);
+  if (memType != Data) {
+    target->set_cpu_diff(raw);
+    return;
+  }
+  target->set_cpu_data(raw);
+}
+
+/*!
+ * \brief CPU/double specialization: hand the TBlob's buffer to a caffe blob.
+ * \param memType Data selects the blob's data pointer; any other value
+ *        selects its diff pointer
+ * \param blob iterator to the caffe blob to update
+ * \param itr iterator to the TBlob whose dptr_ is passed on
+ */
+template<>
+void SetDataGradToBlob<mshadow::cpu, double>(caffeMemoryTypes memType,
+                            std::vector<::caffe::Blob<double>*>::iterator blob,
+                            std::vector<mshadow::TBlob>::const_iterator itr) {
+  ::caffe::Blob<double> *target = *blob;
+  double *raw = reinterpret_cast<double*>(itr->dptr_);
+  if (memType != Data) {
+    target->set_cpu_diff(raw);
+    return;
+  }
+  target->set_cpu_data(raw);
+}
+
+/*!
+ * \brief GPU/float specialization: hand the TBlob's buffer to a caffe blob.
+ * \param memType Data selects the blob's gpu data pointer; any other value
+ *        selects its gpu diff pointer
+ * \param blob iterator to the caffe blob to update
+ * \param itr iterator to the TBlob whose dptr_ is passed on
+ */
+template<>
+void SetDataGradToBlob<mshadow::gpu, float>(caffeMemoryTypes memType,
+                            std::vector<::caffe::Blob<float>*>::iterator blob,
+                            std::vector<mshadow::TBlob>::const_iterator itr) {
+  ::caffe::Blob<float> *target = *blob;
+  float *raw = reinterpret_cast<float*>(itr->dptr_);
+  if (memType != Data) {
+    target->set_gpu_diff(raw);
+    return;
+  }
+  target->set_gpu_data(raw);
+}
+
+/*!
+ * \brief GPU/double specialization: hand the TBlob's buffer to a caffe blob.
+ * \param memType Data selects the blob's gpu data pointer; any other value
+ *        selects its gpu diff pointer
+ * \param blob iterator to the caffe blob to update
+ * \param itr iterator to the TBlob whose dptr_ is passed on
+ */
+template<>
+void SetDataGradToBlob<mshadow::gpu, double>(caffeMemoryTypes memType,
+                            std::vector<::caffe::Blob<double>*>::iterator blob,
+                            std::vector<mshadow::TBlob>::const_iterator itr) {
+  ::caffe::Blob<double> *target = *blob;
+  double *raw = reinterpret_cast<double*>(itr->dptr_);
+  if (memType != Data) {
+    target->set_gpu_diff(raw);
+    return;
+  }
+  target->set_gpu_data(raw);
+}
+
+/*!
+ * \brief Convert a vector of dimension sizes into an mshadow::TShape.
+ * \param vec_int dimension sizes; may be empty
+ * \return shape holding the same dimensions, or the single dimension 1 when
+ *         vec_int is empty
+ */
+mshadow::TShape Vector2TShape(const std::vector<int> &vec_int) {
+  std::vector<mshadow::index_t> vec_indx;
+  // An empty input still produces one entry, so reserve at least one slot.
+  vec_indx.reserve(vec_int.empty() ? 1 : vec_int.size());
+  // Range-for avoids comparing a signed index against the unsigned size().
+  for (const int dim : vec_int)
+    vec_indx.push_back(static_cast<mshadow::index_t>(dim));
+  // 0-dim represents scalar in caffe
+  if (vec_int.empty())
+    vec_indx.push_back(1);
+  mshadow::TShape res;
+  res = vec_indx;
+  return res;
+}
+
+/*!
+ * \brief Copy the dimensions of an mshadow::TShape into a vector of int.
+ * \param tshape shape to convert
+ * \return vector with one entry per dimension, in the same order
+ */
+std::vector<int> TShape2Vector(const mshadow::TShape &tshape) {
+  // Read ndim() once as int so the loop compares like-signed values.
+  const int ndim = static_cast<int>(tshape.ndim());
+  std::vector<int> s;
+  s.reserve(ndim);
+  for (int i = 0; i < ndim; ++i)
+    s.push_back(static_cast<int>(tshape[i]));
+  return s;
+}
+
+}  // namespace caffe
+}  // namespace op
+}  // namespace mxnet