diff --git a/.travis.yml b/.travis.yml
index 0705baa1aca8b4..4fb2ca938795bb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ cache:
     - $HOME/third_party
     - $HOME/.ccache
     - $HOME/.cache/pip
+    - $HOME/Library/Caches/Homebrew
 sudo: required
 dist: trusty
 os:
@@ -25,9 +26,9 @@ addons:
     packages:
       - gcc-4.8
       - g++-4.8
+      - gfortran-4.8
       - git
       - build-essential
-      - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
@@ -54,7 +55,9 @@ before_install:
     fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # protobuf version.
+  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 26306f9849100d..235c95f017f2b6 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -16,7 +16,8 @@
 set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
+set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
+set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
 
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 71e20c85276b01..ab105611c812a4 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,12 +29,14 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 ExternalProject_Add(
     glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS gflags
     GIT_REPOSITORY  "https://github.com/google/glog.git"
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DWITH_GFLAGS=ON
+    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
     CMAKE_ARGS      -DBUILD_TESTING=OFF
 )
 
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 0e8c29c831c823..29d17691db9f45 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -15,7 +15,6 @@
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
-    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
     INCLUDE(ExternalProject)
 
     SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND})
         SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
     ENDIF(WIN32)
 
+    IF(CMAKE_COMPILER_IS_GNUCC)
+        ENABLE_LANGUAGE(Fortran)
+        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+    ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+    IF(NOT CMAKE_Fortran_COMPILER)
+        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+    ENDIF(NOT CMAKE_Fortran_COMPILER)
+
     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
-        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+    )
+
+    ExternalProject_Add_Step(
+        openblas lapacke_install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+        DEPENDEES install
     )
 
     LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 613614c0e3d42f..84f459033f06f8 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -29,17 +29,12 @@ IF(WIN32)
         "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
   SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
 ELSE(WIN32)
-  IF(${HOST_SYSTEM} STREQUAL "centos")
-    SET(LIB "lib64")
-  ELSE()
-    SET(LIB "lib")
-  ENDIF()
   SET(PROTOBUF_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
   SET(PROTOBUF_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
   SET(PROTOBUF_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
   SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF(WIN32)
 
@@ -58,6 +53,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=Release
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR=lib
 )
 
 LIST(APPEND external_project_dependencies protobuf)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 29247d5c3d474a..6372a9a768e580 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -26,11 +26,12 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
     find_python_module(wheel REQUIRED)
     find_python_module(google.protobuf REQUIRED)
     FIND_PACKAGE(NumPy REQUIRED)
-    IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+    IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
         MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
-        "please use pip to upgrade protobuf.")
-    ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+        "please use pip to upgrade protobuf. pip install -U protobuf")
+    ENDIF()
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
     ##################################### PYTHON ########################################
     SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
     SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 63e8bd25462e50..744c766ee7b067 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
         SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
         SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
     ELSE(WIN32)
-        # From PCRE configure
-        ExternalProject_Add(pcre
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY https://github.com/svn2github/pcre.git
-            PREFIX ${SWIG_SOURCES_DIR}/pcre
-            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
-        )
-
         # swig uses bison find it by cmake and pass it down
         FIND_PACKAGE(BISON)
 
@@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
             GIT_REPOSITORY      https://github.com/swig/swig.git
             GIT_TAG             rel-3.0.10
             PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
-            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
-            ./configure
-                --prefix=${SWIG_INSTALL_DIR}
-                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
-            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
-            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
-            UPDATE_COMMAND  ""
-            DEPENDS pcre
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
+            UPDATE_COMMAND      ""
         )
 
         SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index f5e4b3e1eb39ac..172c318b35d611 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -54,6 +54,7 @@ ExternalProject_Add(
     CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
     CMAKE_ARGS      -DWITH_TORCH=OFF
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
     CMAKE_ARGS      -DBUILD_SHARED=ON
 )
 
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 788db404ebfb6f..3e472da7e0bd9c 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -12,6 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Detects the OS and sets appropriate variables.
+# CMAKE_SYSTEM_NAME only give us a coarse-grained name,
+# but the name like centos is necessary in some scenes
+# to distinguish system for customization.
+#
+# for instance, protobuf libs path is <install_dir>/lib64
+# on CentOS, but <install_dir>/lib on other systems.
+
 IF(WIN32)
     SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
@@ -21,6 +29,7 @@ ELSE(WIN32)
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
     ELSE(APPLE)
+
         IF(EXISTS "/etc/issue")
             FILE(READ "/etc/issue" LINUX_ISSUE)
             IF(LINUX_ISSUE MATCHES "CentOS")
@@ -29,8 +38,24 @@ ELSE(WIN32)
                 SET(HOST_SYSTEM "debian")
             ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
                 SET(HOST_SYSTEM "ubuntu")
+            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
+                SET(HOST_SYSTEM "redhat")
+            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
+                SET(HOST_SYSTEM "fedora")
             ENDIF()
         ENDIF(EXISTS "/etc/issue")
+
+        IF(EXISTS "/etc/redhat-release")
+            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ENDIF()
+        ENDIF(EXISTS "/etc/redhat-release")
+
+        IF(NOT HOST_SYSTEM)
+            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
+        ENDIF()
+
     ENDIF(APPLE)
 ENDIF(WIN32)
 
@@ -47,7 +72,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
     LOG_UPDATE      1     # Wrap update in script to log output
     LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
     LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py
index 9a86aafcb2fa4d..49c0ff600c40e0 100755
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
@@ -126,7 +126,7 @@ def forward(self, data, output_layer):
         # For oversampling, average predictions across crops.
         # If not, the shape of output[name]: (1, class_number),
         # the mean is also applicable.
-        return output[output_layer].mean(0)
+        return output[output_layer]['value'].mean(0)
 
     def predict(self, image=None, output_layer=None):
         assert isinstance(image, basestring)
diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py
index f301da382ff8a5..ea1caa7dd9653a 100644
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
@@ -6,33 +6,15 @@
 
 The user api could be simpler and carefully designed.
 """
-import py_paddle.swig_paddle as api
-from py_paddle import DataProviderConverter
-import paddle.trainer.PyDataProvider2 as dp
-import numpy as np
 import random
-from mnist_util import read_from_mnist
-from paddle.trainer_config_helpers import *
-
-
-def optimizer_config():
-    settings(
-        learning_rate=1e-4,
-        learning_method=AdamOptimizer(),
-        batch_size=1000,
-        model_average=ModelAverage(average_window=0.5),
-        regularization=L2Regularization(rate=0.5))
 
+import numpy as np
+import paddle.v2 as paddle_v2
+import py_paddle.swig_paddle as api
+from paddle.trainer_config_helpers import *
+from py_paddle import DataProviderConverter
 
-def network_config():
-    imgs = data_layer(name='pixel', size=784)
-    hidden1 = fc_layer(input=imgs, size=200)
-    hidden2 = fc_layer(input=hidden1, size=200)
-    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
-    cost = classification_cost(
-        input=inference, label=data_layer(
-            name='label', size=10))
-    outputs(cost)
+from mnist_util import read_from_mnist
 
 
 def init_parameter(network):
@@ -75,19 +57,35 @@ def input_order_converter(generator):
 def main():
     api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
 
-    # get enable_types for each optimizer.
-    # enable_types = [value, gradient, momentum, etc]
-    # For each optimizer(SGD, Adam), GradientMachine should enable different
-    # buffers.
-    opt_config_proto = parse_optimizer_config(optimizer_config)
-    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
-    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
-    enable_types = _temp_optimizer_.getParameterTypes()
+    optimizer = paddle_v2.optimizer.Adam(
+        learning_rate=1e-4,
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+    # Create Local Updater. Local means not run in cluster.
+    # For a cluster training, here we can change to createRemoteUpdater
+    # in future.
+    updater = optimizer.create_local_updater()
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # define network
+    images = paddle_v2.layer.data(
+        name='pixel', type=paddle_v2.data_type.dense_vector(784))
+    label = paddle_v2.layer.data(
+        name='label', type=paddle_v2.data_type.integer_value(10))
+    hidden1 = paddle_v2.layer.fc(input=images, size=200)
+    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
+    inference = paddle_v2.layer.fc(input=hidden2,
+                                   size=10,
+                                   act=paddle_v2.activation.Softmax())
+    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
 
     # Create Simple Gradient Machine.
-    model_config = parse_network_config(network_config)
-    m = api.GradientMachine.createFromConfigProto(
-        model_config, api.CREATE_MODE_NORMAL, enable_types)
+    model_config = paddle_v2.layer.parse_network(cost)
+    m = api.GradientMachine.createFromConfigProto(model_config,
+                                                  api.CREATE_MODE_NORMAL,
+                                                  optimizer.enable_types())
 
     # This type check is not useful. Only enable type hint in IDE.
     # Such as PyCharm
@@ -96,19 +94,12 @@ def main():
     # Initialize Parameter by numpy.
     init_parameter(network=m)
 
-    # Create Local Updater. Local means not run in cluster.
-    # For a cluster training, here we can change to createRemoteUpdater
-    # in future.
-    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
-    assert isinstance(updater, api.ParameterUpdater)
-
     # Initialize ParameterUpdater.
     updater.init(m)
 
     # DataProvider Converter is a utility convert Python Object to Paddle C++
     # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(
-        input_types=[dp.dense_vector(784), dp.integer_value(10)])
+    converter = DataProviderConverter(input_types=[images.type, label.type])
 
     train_file = './data/raw_data/train'
     test_file = './data/raw_data/t10k'
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
new file mode 100644
index 00000000000000..06beb7024d1fd0
--- /dev/null
+++ b/demo/mnist/api_train_v2.py
@@ -0,0 +1,62 @@
+import paddle.v2 as paddle
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+    hidden1 = paddle.layer.fc(input=images, size=200)
+    hidden2 = paddle.layer.fc(input=hidden1, size=200)
+    inference = paddle.layer.fc(input=hidden2,
+                                size=10,
+                                act=paddle.activation.Softmax())
+    cost = paddle.layer.classification_cost(input=inference, label=label)
+
+    parameters = paddle.parameters.create(cost)
+
+    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=adam_optimizer)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                result = trainer.test(reader=paddle.reader.batched(
+                    paddle.dataset.mnist.test(), batch_size=256))
+
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+        else:
+            pass
+
+    trainer.train(
+        reader=paddle.reader.batched(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=32),
+        event_handler=event_handler)
+
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output=inference,
+        parameters=parameters,
+        reader=paddle.reader.batched(
+            paddle.reader.firstn(
+                paddle.reader.map_readers(lambda item: (item[0], ),
+                                          paddle.dataset.mnist.test()),
+                n=100),
+            batch_size=32))
+    print probs.shape
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py
index 4631816c43ef48..6074cc1d3a85e1 100755
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
@@ -156,7 +156,7 @@ def forward(self, data, output_layer):
             # For oversampling, average predictions across crops.
             # If not, the shape of output[name]: (1, class_number),
             # the mean is also applicable.
-            res[name] = output[name].mean(0)
+            res[name] = output[name]['value'].mean(0)
 
         return res
 
diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py
index 00f72cecacb454..4b7f5d0e504aef 100755
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
@@ -32,4 +32,6 @@ def process(settings, file_name):
             word_slot = [
                 settings.word_dict[w] for w in words if w in settings.word_dict
             ]
+            if not word_slot:
+                continue
             yield word_slot, label
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index 8ec490f6469192..64c78e0d6b9297 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -138,7 +138,11 @@ def main():
 
     batch = []
     for line in sys.stdin:
-        batch.append([predict.get_index(line)])
+        words = predict.get_index(line)
+        if words:
+            batch.append([words])
+        else:
+            print('All the words in [%s] are not in the dictionary.' % line)
         if len(batch) == batch_size:
             predict.batch_predict(batch)
             batch = []
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
index cec35dce11d1c1..2dbd5e8805dd97 100755
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
@@ -25,6 +25,6 @@ paddle train \
     --config_args=is_predict=1 \
     --predict_output_dir=. 
 
-python gen_result.py > result.txt
+python gen_result.py > result.csv
 
 rm -rf rank-00000
diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
index 4e429650e54517..bbea823de4d870 100644
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -139,24 +139,12 @@ lstmemory
     :members: lstmemory
     :noindex:
 
-lstm_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: lstm_step_layer
-    :noindex:
-
 grumemory
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers
     :members: grumemory
     :noindex:
 
-gru_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: gru_step_layer
-    :noindex:
-
 Recurrent Layer Group
 =====================
 
@@ -172,6 +160,18 @@ recurrent_group
     :members: recurrent_group
     :noindex:
     
+lstm_step_layer
+---------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: lstm_step_layer
+    :noindex:
+
+gru_step_layer
+---------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: gru_step_layer
+    :noindex:
+
 beam_search
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -279,6 +279,12 @@ concat_layer
     :members: concat_layer
     :noindex:
 
+seq_concat_layer
+----------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_concat_layer
+    :noindex:
+
 Reshaping Layers
 ================
 
@@ -302,6 +308,18 @@ repeat_layer
     :members: repeat_layer
     :noindex:
 
+rotate_layer
+------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: rotate_layer
+    :noindex:
+
+seq_reshape_layer
+-----------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_reshape_layer
+    :noindex:
+
 Math Layers
 ===========
 
@@ -382,6 +400,15 @@ sampling_id_layer
     :members: sampling_id_layer
     :noindex:
 
+Slicing and Joining Layers
+==========================
+
+pad_layer
+-----------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: pad_layer
+    :noindex:
+
 ..  _api_trainer_config_helpers_layers_cost_layers:
 
 Cost Layers
@@ -441,6 +468,12 @@ ctc_layer
     :members: ctc_layer
     :noindex:
 
+warp_ctc_layer
+--------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: warp_ctc_layer
+    :noindex:
+
 nce_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
diff --git a/doc/design/api.md b/doc/design/api.md
new file mode 100644
index 00000000000000..8185d2af0ea264
--- /dev/null
+++ b/doc/design/api.md
@@ -0,0 +1,262 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+As our design principle is starting from the essence: how could we
+allow users to express and solve their problems at neural networks.
+Some essential concepts that our API have to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
+
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+   of parameters.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summarization
+of
+[our disucssion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy-a-paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+use the sub-network in the trained topology in inference:
+
+```python
+def f(in):
+    e = paddle.layer.embedding(in, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# correspoinding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+               paddle.layer.cross_entropy(fA, fQ),
+               paddle.layer.corss_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fC, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
+this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
+
+```python
+def G(in):
+    # over-simplified example as G has only one layers:
+    return paddle.layer.fc(in, parameter_name="G")
+
+def D(in);
+    # again, over-simplified:
+    return paddle.layer.fc(in, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 share parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+Above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   requires both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
+
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `train.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`.  The former is less flexible but also less error-prone.  We
+decide to provide the flexibility to users to define their readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to training a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handle some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if ininstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for iPython/Jypyter,
+we can use metaplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If users want to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provides access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code if `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on users' personal computer, and it should work as a
+*launcher*.  Otherwise, it knows that it's running on the cluster and
+need to figure out its role as either the master, or a trainer, or a
+parameter server.
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
new file mode 100644
index 00000000000000..17d52b9e20b813
--- /dev/null
+++ b/doc/design/reader/README.md
@@ -0,0 +1,161 @@
+# Python Data Reader Design Doc
+
+At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader* decorator is a function, which accepts one or more readers, and returns a reader.
+
+and provide frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+
+```
+iterable = data_reader()
+```
+
+Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+
+An example implementation for single item data reader creator:
+
+```python
+def reader_creator_random_image(width, height):
+	def reader():
+		while True:
+			yield numpy.random.uniform(-1, 1, size=width*height)
+	return reader
+```
+
+An example implementation for multiple item data reader creator:
+```python
+def reader_creator_random_imageand_label(widht, height, label):
+	def reader():
+		while True:
+			yield numpy.random.uniform(-1, 1, size=width*height), label
+	return reader
+```
+
+## Usage
+
+data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+
+```python
+# two data layer is created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+
+paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+
+Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do:
+
+```python
+def reader_creator_random_image(width, height):
+	def reader():
+		while True:
+			yield numpy.random.uniform(-1, 1, size=width*height)
+	return reader
+
+def reader_creator_bool(t):
+	def reader:
+		while True:
+			yield t
+	return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist produces two items per data entry.
+# And we don't care second item at this time.
+paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
+```
+
+## Q & A
+
+### Why return only a single entry, but not a mini batch?
+
+If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`.
+
+Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+
+### Why use a dictionary but not a list to provide mapping?
+
+We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create custom data reader creator
+
+```python
+def image_reader_creator(image_path, label_path, n):
+	def reader():
+		f = open(image_path)
+		l = open(label_path)
+		images = numpy.fromfile(
+			f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+		images = images / 255.0 * 2.0 - 1.0
+		labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+		for i in xrange(n):
+			yield images[i, :], labels[i] # a single entry of data is created each time
+		f.close()
+		l.close()
+	return reader
+
+# images_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(reader, {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train could be:
+
+```python
+def make_minibatch(reader, minibatch_size):
+	def ret():
+		r = reader()
+		buf = [r.next() for x in xrange(minibatch_size)]
+		while len(buf) > 0:
+			yield buf
+			buf = [r.next() for x in xrange(minibatch_size)]
+	return ret
+
+def train(reader, mapping, batch_size, total_pass):
+	for pass_idx in range(total_pass):
+		for mini_batch in make_minibatch(reader): # this loop will never end in online learning.
+			do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 6954be3b2bb956..d9d54bff3096cb 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on Centos](#centos)
+
 
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -16,9 +18,10 @@ cd paddle
 
 To compile the source code, your computer must be equipped with the following dependencies.
 
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
+- **Python**: only support Python 2.7
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
@@ -64,7 +67,8 @@ As a simple example, consider the following:
 
 1. **BLAS Dependencies(optional)**
   
-    Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
+    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
     # specify MKL
@@ -94,12 +98,78 @@ As a simple example, consider the following:
 
 ### Install Dependencies
 
-- **CPU Dependencies**
+- **Paddle Dependencies**
 
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
+    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
+    sudo pip install 'protobuf==3.1.0.post1'
+
+    # install cmake 3.4
+    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
+        cd .. && rm -rf cmake-3.4.1
+    ```
+
+- **GPU Dependencies (optional)**
+
+    To build GPU version, you will need the following installed:
+
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
+
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create build folder under paddle project directory.
+
+```bash
+mkdir build && cd build
+``` 
+
+Finally, you can build and install PaddlePaddle:
+
+```bash
+# you can add build option here, such as:    
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
+```
+## <span id="centos">Build on Centos 7</span>
+
+### Install Dependencies
+
+- **CPU Dependencies**
+
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
     sudo pip install wheel numpy
     sudo pip install 'protobuf>=3.0.0'
     ```
@@ -142,7 +212,7 @@ Finally, you can build and install PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 35234e0eb3ece3..6b132d2a4d31ab 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -12,7 +12,7 @@ PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Dock
 PaddlePaddle提供的Docker镜像版本
 --------------------------------
 
-我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddle-dev/paddle` ，tag分别为
+我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddledev/paddle` ，tag分别为
 
 +-----------------+------------------+------------------------+-----------------------+
 |                 |   normal         |           devel        |          demo         |
@@ -45,7 +45,7 @@ PaddlePaddle提供的Docker镜像版本
 
     if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
 
-如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddle-dev/paddle:cpu-devel-latest` 来引用这个image。
+如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddledev/paddle:cpu-devel-latest` 来引用这个image。
 
 PaddlePaddle提供的镜像并不包含任何命令运行，想要运行PaddlePaddle，您需要进入镜像运行PaddlePaddle
 程序或者自定义一个含有启动脚本的image。具体请参考注意事项中的 :code:`使用ssh访问PaddlePaddle镜像`
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 51a1a11674d987..5a1056e859a0c9 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,70 +16,71 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
 
-The general development workflow with Docker and CMake is as follows:
-
-1. Get the source code of Paddle:
+1. Build the Development Environment as a Docker Image
 
    .. code-block:: bash
 
-      git clone https://github.com/PaddlePaddle/Paddle.git
+      git clone --recursive https://github.com/PaddlePaddle/Paddle
+      cd Paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
 
 
-2. Build a development Docker image :code:`paddle:dev` from the source
-   code.  This image contains all the development tools and
-   dependencies of PaddlePaddle.
+   Note that by default :code:`docker build` wouldn't import source
+   tree into the image and build it.  If we want to do that, we need
+   to set a build arg:
 
    .. code-block:: bash
 
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+
+
+2. Run the Development Environment
 
-   Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such case, we can specify an apt-source mirror server that is geologically nearer to us. In the following example, we specified an apt-source server that responds fast in China.You can specify the UBUNTU MIRROR with :code:`--build-arg UBUNTU_MIRROR` like the example below.
+   Once we got the image :code:`paddle:dev`, we can use it to develop
+   Paddle by mounting the local source code tree into a container that
+   runs the image:
 
    .. code-block:: bash
 
-      docker build \
-       --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
-       -t paddle:dev \
-       -f paddle/scripts/docker/Dockerfile .
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
+
+   This runs a container of the development environment Docker image
+   with the local source tree mounted to :code:`/paddle` of the
+   container.
+
+   Note that the default entry-point of :code:`paddle:dev` is
+   :code:`sshd`, and above :code:`docker run` commands actually starts
+   an SSHD server listening on port 2202.  This allows us to log into
+   this container with:
+
+   .. code-block:: bash
 
+      ssh root@localhost -p 2202
 
-3. Run the image as a container and mounting local source code
-   directory into the container.  This allows us to change the code on
-   the host and build it within the container.
+   Usually, I run above commands on my Mac.  I can also run them on a
+   GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
 
    .. code-block:: bash
 
-      docker run       \
-       -d              \
-       --name paddle   \
-       -p 2022:22      \
-       -v $PWD:/paddle \
-       paddle:dev
+      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
 
-   where :code:`-d` makes the container running in background,
-   :code:`--name paddle` allows us to run a nginx container to serve
-   documents in this container, :code:`-p 2022:22` allows us to SSH
-   into this container, :code:`-v $PWD:/paddle` shares the source code
-   on the host with the container.
+3. Build and Install Using the Development Environment
 
-4. SSH into the container:
+   Once I am in the container, I can use
+   :code:`paddle/scripts/docker/build.sh` to build, install, and test
+   Paddle:
 
    .. code-block:: bash
 
-      ssh root@localhost -p 2022
+      /paddle/paddle/scripts/docker/build.sh
 
-5. We can edit the source code in the container or on this host.  Then
-   we can build using cmake
+   This builds everything about Paddle in :code:`/paddle/build`.  And
+   we can run unit tests there:
 
    .. code-block:: bash
 
-      cd /paddle # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
+      cd /paddle/build
+      ctest
 
 
 CPU-only and GPU Images
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 943b1d4bb84646..4b328fc9d38bc5 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
         
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
 
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
@@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
         last = last_seq(input=layer,
                         agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index bd3d0ec2920570..5b84eea491f874 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -10,6 +10,7 @@
   usage/cmd_parameter/index_cn.rst
   usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
+  usage/k8s/k8s_basis_cn.md
   usage/k8s/k8s_cn.md
   usage/k8s/k8s_distributed_cn.md
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index acdcfa1c0047ce..274452fbf0c595 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -6,7 +6,7 @@
 
 在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
 
 ## 前提条件
 
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 30963dcd927250..c60876721cbf55 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -2,7 +2,7 @@
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
 
 ## Prerequisite
 
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index 833e21dd19ef3c..2e2a2fcc54a09f 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -127,11 +127,6 @@
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
-<tr>
-<td class="left">allow_inefficient_sparse_update</td>
-<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
-</tr>
-
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
index 013edbc9047817..e5546f0ddc78a9 100644
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
-<tr>
-<td class="left">allow_inefficient_sparse_update</td>
-<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
-</tr>
-
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
index dbf7c6f00b8ba5..3b573a324d541b 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -306,10 +306,6 @@
   - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
   - 类型: bool (默认: 0).
 
-* `--allow_inefficient_sparse_update`
-  - 指示是否允许低效率的稀疏更新.
-  - 类型: bool (默认: 0).
-
 * `--check_sparse_distribution_batches`
   - 每运行多少个批次执行一次稀疏参数分布的检查.
   - 类型: int32 (默认: 100).
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index aa69a3bd5423c4..33b7ec0d51a96e 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -310,10 +310,6 @@
   - show log details for sparse parameter distribution in pserver.
   - type: bool (default: 0).
 
-* `--allow_inefficient_sparse_update`
-  - Whether to allow inefficient sparse update.
-  - type: bool (default: 0).
-
 * `--check_sparse_distribution_batches`
   - Running sparse parameter distribution check every so many batches.
   - type: int32 (default: 100).
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index b04bfba590de42..ce72b0803818d5 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -1,26 +1,67 @@
-# Kubernetes on AWS
 
-## Create AWS Account and IAM Account
+# Distributed PaddlePaddle Training on AWS with Kubernetes
 
-To use AWS, we need to sign up an AWS account on Amazon's Web site.
-An AWS account allows us to login to the AWS Console Web interface to
-create IAM users and user groups. Usually, we create a user group with
-privileges required to run PaddlePaddle, and we create users for
-those who are going to run PaddlePaddle and add these users into the
-group. IAM users can identify themselves using password and tokens,
-where passwords allows users to log in to the AWS Console, and tokens
-make it easy for users to submit and inspect jobs from the command
-line.
+We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
+
+## Distributed PaddlePaddle Training Core Concepts
+
+### Distributed Training Job
+
+A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
+
+Each Kuberentes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables.
+
+In a distributed training job, we would:
+
+1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
+
+### Parameter Servers and Trainers
+
+There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
+
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
+
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be schduled onto another node of different ip address. However, now we are using static ip. This will be improved.
+
+Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job.
+
+### Trainer ID
+
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
+
+### Training
+
+The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
+
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
+1. Copy the training data from EFS persistent volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `train_id` 0 will automatically write results onto EFS volume.
+
+
+## PaddlePaddle on AWS with Kubernetes
+
+### Choose AWS Service Region
+This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link
+https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/
+Choose a region which has the following services available: EC2, EFS, VPS, CloudFormation, KMS, VPC, S3.
+In this tutorial, we use "Oregon(us-west-2)" as example.
+
+### Create AWS Account and IAM Account
+
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
 
 To sign up an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
-To create users and user groups under an AWS account, please
+To create IAM users and user groups under an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
 
-Please be aware that this tutorial needs the following privileges in
-the user group:
+Please be aware that this tutorial needs the following privileges for the user in IAM:
 
 - AmazonEC2FullAccess
 - AmazonS3FullAccess
@@ -31,25 +72,16 @@ the user group:
 - IAMUserSSHKeys
 - IAMFullAccess
 - NetworkAdministrator
+- AWSKeyManagementServicePowerUser
 
 
-By the time we write this tutorial, we noticed that Chinese AWS users
-might suffer from authentication problems when running this tutorial.
-Our solution is that we create a VM instance with the default Amazon
-AMI and in the same zone as our cluster runs, so we can SSH to this VM
-instance as a tunneling server and control our cluster and jobs from
-it.
-
-
-## PaddlePaddle on AWS
-
-Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
-
-
-###Download kube-aws and kubectl
+### Download kube-aws and kubectl
 
-####kube-aws
+#### kube-aws
 
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
+##### Verify kube-aws integrity
+Note: if you are using a non-official release (e.g RC release) kube-aws, you can skip this setp.
 Import the CoreOS Application Signing Public Key:
 
 ```
@@ -63,7 +95,7 @@ gpg2 --fingerprint FC8A365E
 ```
 The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
 
-Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1
 
 Validate the tarball's GPG signature:
 
@@ -74,7 +106,7 @@ PLATFORM=darwin-amd64
 
 gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
 ```
-
+##### Install kube-aws
 Extract the binary:
 
 ```
@@ -88,34 +120,39 @@ mv ${PLATFORM}/kube-aws /usr/local/bin
 ```
 
 
-####kubectl
+#### kubectl
 
-Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
 
-Extract the tarball and then concate the kubernetes binaries directory into PATH:
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
 
 ```
-export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
 
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
 ```
 
-User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
-
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
 
-###Configure AWS Credentials
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
 
-First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+### Configure AWS Credentials
 
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
 
 And then configure your AWS account information:
 
 ```
 aws configure
-
 ```
 
 
-Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+Fill in the required fields:
 
 
 ```
@@ -123,36 +160,44 @@ AWS Access Key ID: YOUR_ACCESS_KEY_ID
 AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
 Default region name: us-west-2
 Default output format: json
-
 ```
 
-Test that your credentials work by describing any instances you may already have running on your account:
+`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` is the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+Verify that your credentials work by describing any instances you may already have running on your account:
 
 ```
 aws ec2 describe-instances
 ```
 
-###Define Cluster Parameters
+### Define Cluster Parameters
 
-####EC2 key pair
+#### EC2 key pair
 
 The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
 
-After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create a EC2 key pair
+
+After creating a key pair, you will use the key pair name to configure the cluster.
+
+Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to creat key pairs in that region (Oregon).
 
-####KMS key
+Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later.
+
+
+#### KMS key
 
 Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
 
-You can create a KMS key in the AWS console, or with the aws command line tool:
+You can create a KMS key with the aws command line tool:
 
 ```
-$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
 {
     "KeyMetadata": {
         "CreationDate": 1458235139.724,
         "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
         "AWSAccountId": "xxxxxxxxxxxxx",
         "Enabled": true,
         "KeyUsage": "ENCRYPT_DECRYPT",
@@ -162,14 +207,16 @@ $ aws kms --region=us-west-2 create-key --description="kube-aws assets"
 }
 ```
 
-You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+We will need to use the value of `Arn` later.
+
+And then let's add several inline policies in your IAM user permission.
 
-And then you need to add several inline policies in your user permission.
+Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`.
 
-kms inline policy:
+Paste into following inline policies:
 
 ```
-{
+ (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){
     "Version": "2012-10-17",
     "Statement": [
         {
@@ -180,18 +227,10 @@ kms inline policy:
                 "kms:Encrypt"
             ],
             "Resource": [
-                "arn:aws:kms:*:xxxxxxxxx:key/*"
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
             ]
-        }
-    ]
-}
-```
-cloudformation inline policy:
-
-```
-"Version": "2012-10-17",
-    "Statement": [
-        {
+        },
+		{
             "Sid": "Stmt1482205746000",
             "Effect": "Allow",
             "Action": [
@@ -200,26 +239,44 @@ cloudformation inline policy:
                 "cloudformation:DeleteStack",
                 "cloudformation:DescribeStacks",
                 "cloudformation:DescribeStackResource",
-                "cloudformation:GetTemplate"
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
             ],
             "Resource": [
-                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
             ]
         }
     ]
 }
 ```
+`Version` : Its value has to be exactly "2012-10-17".
+`AWS_ACCOUNT_ID`: You can get it from following command line:
 
+```
+aws sts get-caller-identity --output text --query Account
+```
 
-####External DNS name
+`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. 
+Please note, stack name must satisfy regular expression pattern: [a-zA-Z][-a-zA-Z0-9*]*, which means no "_" or "-" in stack name, or kube-aws will throw error in later steps.
 
-When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+#### External DNS name
 
-####S3 bucket
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+DNS name should have a CNAME points to cluster DNS name or an A record points to the cluster IP address.
+
+We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps.
+
+#### S3 bucket
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-####Initialize an asset directory
+There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2).
+
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon).
+
+
+#### Initialize Assets
 
 Create a directory on your local machine to hold the generated assets:
 
@@ -231,284 +288,245 @@ $ cd my-cluster
 Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
 
 ```
-$ kube-aws init \
---cluster-name=my-cluster-name \
---external-dns-name=my-cluster-endpoint \
---region=us-west-1 \
---availability-zone=us-west-1c \
---key-name=key-pair-name \
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
 --kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
 ```
 
-There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
 
-####Render contents of the asset directory
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
 
-In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
 
-```
-$ kube-aws render credentials --generate-ca
-```
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
 
-The next command generates the default set of cluster assets in your asset directory.
+Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
 
-```
-sh $ kube-aws render stack
-```
+Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2a`, or `us-west-2b`)
 
-Here's what the directory structure looks like:
 
-```
-$ tree
-.
-├── cluster.yaml
-├── credentials
-│   ├── admin-key.pem
-│   ├── admin.pem
-│   ├── apiserver-key.pem
-│   ├── apiserver.pem
-│   ├── ca-key.pem
-│   ├── ca.pem
-│   ├── worker-key.pem
-│   └── worker.pem
-│   ├── etcd-key.pem
-│   └── etcd.pem
-│   ├── etcd-client-key.pem
-│   └── etcd-client.pem
-├── kubeconfig
-├── stack-template.json
-└── userdata
-    ├── cloud-config-controller
-    └── cloud-config-worker
-```
-
-These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
 
+By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3.
 
-###Kubernetes Cluster Start Up
 
-####Create the instances defined in the CloudFormation template
+#### Render contents of the asset directory
 
-Now for the exciting part, creating your cluster:
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
 
 ```
-$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+kube-aws render credentials --generate-ca
 ```
 
-####Configure DNS
-
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
-
-####Access the cluster
-
-Once the API server is running, you should see:
+The next command generates the default set of cluster assets in your asset directory.
 
 ```
-$ kubectl --kubeconfig=kubeconfig get nodes
-NAME                                       STATUS                     AGE
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+kube-aws render stack
 ```
+Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder.
 
 
-###Setup PaddlePaddle Environment on AWS
-
-Now, we've created a cluster with following network capability:
-
-1. All Kubernetes nodes can communicate with each other.
-
-1. All Docker containers on Kubernetes nodes can communicate with each other.
-
-1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
-
-1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
-
-
-For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+### Kubernetes Cluster Start Up
 
+#### Create the instances defined in the CloudFormation template
 
-1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+Now let's create your cluster (choose any `PREFIX` for the command below):
 
-1. Create the Elastic File System in AWS console, and attach the new VPC with it.
-<center>![](src/create_efs.png)</center>
-
-
-1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
-<center>![](src/add_security_group.png)</center>
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
 
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
 
-1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
-<center>![](src/efs_mount.png)</center>
 
+#### Configure DNS
 
-Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
 
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
 
+If you own a DNS name, set the A record to any of the above ip. __Or__ you can set up CNAME point to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
 
-###Core Concept of PaddlePaddle Training on AWS
+##### Find IP address
 
-Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+Use command `dig` to check the load balancer hostname to get the ip address.
 
-####Use Kubernetes Job
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
 
-We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
 
-We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
 
-In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+In the above output, both ip `54.241.164.52`, `54.67.102.112` will work.
 
-####Create PaddlePaddle Node
+*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster".
 
-After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+*If you do not own a DNS name*:
+##### Update local DNS association
+Edit `/etc/hosts` to associate above ip with the DNS name.
+##### Add Route53 private name service in VPC
+ - Open [Route53 Console](https://console.aws.amazon.com/route53/home)
+ - Create hosted zone with following config
+   - Domain name: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
 
+   ![route53 zone setting](src/route53_create_zone.png)
+ - Add A record
+    - Click on the zone "paddle" just created
+    - Click the button "Create record set"
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
 
-####Start up Training
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - Verify name service
+    - Connect to any instance created by kube-aws via ssh
+    - Run command "host paddle", see if the ip returned is the private ip of kube-controller
 
-After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+#### Access the cluster
 
-With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+Once the API server is running, you should see:
 
-1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
-1. Copy the training data from EFS sharing volume into container.
-1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
-1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+```
+$ kubectl --kubeconfig=kubeconfig get nodes 
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
 
 
-###Start PaddlePaddle Training Demo on AWS
+### Setup Elastic File System for Cluster
 
-Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS.
 
-1. Build PaddlePaddle Docker image.
-1. Divide the training data file and upload it onto the EFS sharing volume.
-1. Create the training job yaml file, and start up the job.
-1. Check the result after training.
+1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)
+  1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
+  <center>![](src/worker_security_group.png)</center>
+  2. Add security group `paddle-efs` with `ALL TCP` inbound rule and custom source as group id of `paddle-cluster-sg-worker`. And VPC of `paddle-cluster-vpc`. Make sure availability zone is same as the one you used in [Initialize Assets](#initialize-assets).
+  <center>![](src/add_security_group.png)</center>
 
-####Build PaddlePaddle Docker Image
+2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with `paddle-cluster-vpc` VPC. Make sure subnet is `paddle-cluster-Subnet0` andd security group is `paddle-efs`.
+<center>![](src/create_efs.png)</center>
 
-PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
 
-1. Copy the training data into container.
-1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+### Start PaddlePaddle Training Demo on AWS
 
+#### Configure Kubernetes Volume that Points to EFS
 
-Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volumn.
 
+Save following snippet as `pv.yaml`
 ```
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER zjsxzong89@gmail.com
-
-COPY start.sh /root/
-COPY start_paddle.py /root/
-CMD ["bash"," -c","/root/start.sh"]
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
 ```
 
-At this point, we will copy our `start.sh` and `start_paddle.py` file into container, and then exec `start_paddle.py` script to start up the training, all the steps like assigning trainer_id, getting other nodes' ip are implemented in `start_paddle.py`.
-
-`start_paddle.py` will start parsing the parameters.
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`
 
+Run following command to create a persistent volumn:
 ```
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
 ```
 
-And then using function `getPodList()` to query all the pod information from the job name through Kubernetes api server. When all the pods are in the running status, using `getIdMap(podlist)` to get the trainer_id.
+Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.
 
+Save following snippet as `pvc.yaml`.
 ```
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
 ```
 
-In function `getIdMap(podlist)`, we use podlist to get the ip address for each pod and sort them, use the index as the trainer_id.
-
+Run following command to create a persistent volumn claim:
 ```
-def getIdMap(podlist):
-    '''
-    generate tainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
 ```
 
-After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+#### Prepare Training Data
 
-In function `startPaddle`, the most important work is to generate `paddle pserver` and `paddle train` start up parameters. For example, `paddle train` parameter parsing, we will get parameters like `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volumn that we just created.
 
+save following snippet as `paddle-data-job.yaml`
 ```
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
-
-Use `docker build` to build toe Docker Image:
-
-```
-docker build -t your_repo/paddle:mypaddle .
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
 ```
 
-And then push the built image onto docker registry.
-
+Run following command to launch the job:
 ```
-docker push  your_repo/paddle:mypaddle
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
 ```
 
-####Upload Training Data File
-
-Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
-
+Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1`
 ```
-efs
-└── paddle-cluster-job
-    ├── data
-    │   ├── 0
-    │   │
-    │   ├── 1
-    │   │
-    │   └── 2
-    ├── output
-    └── recommendation
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
 ```
 
-The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
-
-
-####Create Kubernetes Job
-
-Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
-
-In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
 
-The yaml file content is as follows:
+#### Start Training
 
+Now we are ready to start paddle training job. Save following snippet as `paddle-cluster-job.yaml`
 ```
 apiVersion: batch/v1
 kind: Job
@@ -522,12 +540,12 @@ spec:
       name: paddle-cluster-job
     spec:
       volumes:
-      - name: jobpath
-        hostPath:
-          path: /home/admin/efs
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
       containers:
       - name: trainer
-        image: drinkcode/paddle:k8s-job
+        image: paddledev/paddle-tutorial:k8s_train
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -537,7 +555,7 @@ spec:
         - name: JOB_NAMESPACE
           value: default
         - name: TRAIN_CONFIG_DIR
-          value: recommendation
+          value: quick_start
         - name: CONF_PADDLE_NIC
           value: eth0
         - name: CONF_PADDLE_PORT
@@ -548,119 +566,124 @@ spec:
           value: "2"
         - name: CONF_PADDLE_GRADIENT_NUM
           value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
         volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath
+        - mountPath: "/home/jobpath"
+          name: efs
         ports:
-        - name: jobport
-          hostPort: 30001
-          containerPort: 30001
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
       restartPolicy: Never
-
 ```
 
-In yaml file, the metadata's name is the job's name. `parallelism, completions` means this job will simultaneously start up 3 PaddlePaddle nodes, and this job will be finished when there are 3 finished pods. For the data store volume, we declare the path jobpath, it mount the /home/admin/efs on host machine into the container with path /home/jobpath. So in container, the /home/jobpath actually stores the data onto EFS sharing volume.
-
-`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods.
 
-`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents job name, `TRAIN_CONFIG_DIR` represents the training data file directory, we can these three parameters to get the file path for this training.
+`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables.
 
-`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+`ports` indicates that TCP port 7164 - 7167 are exposed for communication between `pserver` ans trainer. port starts continously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse paramter updates to improve latency.
 
-`CONF_PADDLE_PORT` represents `paddle pserver` process's `--port` parameters, `CONF_PADDLE_PORTS_NUM` represents `--port_num` parameter.
-
-`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+Run following command to launch the job.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml
+```
 
-`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+Inspect individual pods
 
-After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
 
+Inspect individual console output
 ```
-kubectl create -f job.yaml
+kubectl --kubeconfig=kubeconfig log -f POD_NAME
 ```
 
-After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`).
 
+Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
 
+The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
 
-####Check Training Results
+#### Inspect Training Output
 
-During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
+Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output.
 
+1. ssh Into Worker EC2 instance
 ```
-[root@paddle-kubernetes-node0 output]# tree -d
-.
-├── node_0
-│   ├── server.log
-│   └── train.log
-├── node_1
-│   ├── server.log
-│   └── train.log
-├── node_2
-......
-├── pass-00002
-│   ├── done
-│   ├── ___embedding_0__.w0
-│   ├── ___embedding_1__.w0
-......
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
 ```
 
-We can always check the container training status through logs, for example:
+`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance.
 
+2. Mount EFS
 ```
-[root@paddle-kubernetes-node0 node_0]# cat train.log
-I1116 09:10:17.123121    50 Util.cpp:155] commandline:
- /usr/local/bin/../opt/paddle/bin/paddle_trainer
-    --nics=eth0 --port=7164
-    --ports_num=2 --comment=paddle_process_by_paddle
-    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
-    --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
-    --local=0 --trainer_id=0
-    --save_dir=/home/jobpath/paddle-cluster-job/output
-I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
-I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
-[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
-[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
-I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
-I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
-I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
-I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
-I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
-I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
-I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
-I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
-I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
 ```
 
-It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Look similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`.
 
+Now folder `efs` will have structure similar to:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+	|   |-- pass-00001...
+```
+`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`.
 
-###Kubernetes Cluster Tear Down
+### Kubernetes Cluster Tear Down
 
+#### Delete EFS
 
-If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
+Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volumn that we created.
 
-```
-kube-aws destroy
-```
-It's an async call, it might take 5 min to tear down the whole cluster.
+#### Delete security group
 
-If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`.
 
 
+#### Delete S3 Bucket
 
-## For Experts with Kubernetes and AWS
+Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
 
-Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+#### Destroy Cluster
+
+```
+kube-aws destroy
+```
 
-### Some Presumptions
+The command will return immediately, but it might take 5 min to tear down the whole cluster.
 
-* Instances run on CoreOS, the official IAM.
-* Kubernetes node use instance storage, no EBS get mounted. Etcd is running on additional node.
-* For networking, we use Flannel network at this moment, we will use Calico solution later on.
-* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
+You can go to [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check destroy process.
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
new file mode 100644
index 00000000000000..6278dacb17a378
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
@@ -0,0 +1,75 @@
+# Kubernetes 简介
+
+[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
+
+- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
+
+- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
+
+- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
+
+- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
+
+- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+
+- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
+
+# 部署Kubernetes集群
+
+Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出集中常见的部署方法：
+
+- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
+- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
+- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
+- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
+
+可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
+
+# 选择存储方案
+
+容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
+常见的可选存储服务包括：
+
+- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
+- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
+- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
+- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
+
+# 配置kubectl
+
+## 安装kubectl
+```
+# OS X
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
+
+# Linux
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
+
+# Windows
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
+```
+
+## 配置kubectl访问你的kubernetes集群
+
+编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
+```
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority: /path/to/ca.crt
+    server: https://[Master-IP]:443
+  name: minikube
+contexts:
+- context:
+    cluster: minikube
+    user: minikube
+  name: minikube
+current-context: minikube
+kind: Config
+preferences: {}
+users:
+- name: minikube
+  user:
+    client-certificate: /path/to/apiserver.crt
+    client-key: /Users/wuyi/.minikube/apiserver.key
+```
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
index 2063b98ca8aab9..7213a977b8a2a4 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -2,168 +2,50 @@
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
-## Kubernetes 基本概念
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、 扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
-- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
 
 ## 整体方案
 
-### 部署Kubernetes集群
-
-首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
+在训练之前，用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统，需要使用其制定的方式挂载后并导入数据)，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
 ![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
-上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
-
-### 使用 Job
-
-我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业，在作业完成后，Kubernetes会销毁job产生的容器并且释放相关资源。
-
-在Kubernetes中，可以通过编写一个YAML文件，来描述这个job，在这个文件中，主要包含了一些配置信息，例如PaddlePaddle的节点个数，`paddle pserver`开放的端口个数与端口号，使用的网卡设备等，这些信息通过环境变量的形式传递给容器内的程序使用。
+上图描述了一个3节点的分布式训练场景，在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
-在一次分布式训练中，用户确定好本次训练需要的PaddlePaddle节点个数，将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件，提交给Kubernetes集群创建并开始作业。
+根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，按照下面步骤即可：
 
-### 创建PaddlePaddle节点
-
-当Kubernetes master收到请求，解析完YAML文件后，会创建出多个pod(个数为PaddlePaddle节点数)，Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点，当pod被成功分配到一台物理/虚拟机上后，Kubernetes会启动pod内的容器，这个容器会根据YAML文件中的环境变量，启动`paddle pserver`与`paddle train`进程。
-
-### 启动训练
-
-在容器启动后，会通过脚本来启动这次分布式训练，我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id，由于PaddlePaddle本身不提供类似服务发现的功能，所以在本文的启动脚本中，每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。
-
-根据这些pod信息，就可以通过某种方式，为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序，将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下：
-
-  1. 查询Kubernetes apiserver获取pod信息，根据IP分配trainer_id
-  1. 从MFS共享目录中拷贝训练文件到容器内
-  1. 根据环境变量，解析出`paddle pserver`与`paddle train`的启动参数，启动进程
-  1. 训练时，PaddlePaddle会自动将结果保存在trainer_id为0的节点上，将输出路径设置为MFS目录，保存输出的文件
-
-
-## 搭建过程
-
-根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，主要分为以下几个步骤：
-
-1. 制作PaddlePaddle镜像
-1. 将训练文件与切分好的数据上传到共享存储
-1. 编写本次训练的YAML文件，创建一个Kubernetes job
-1. 训练结束后查看输出结果
+1. [制作PaddlePaddle镜像](#制作镜像)
+1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件)
+1. [编写本次训练的YAML文件，创建一个Kubernetes job](#创建Job)
+1. [训练结束后查看输出结果](#查看输出)
 
 下面就根据这几个步骤分别介绍。
 
-
 ### 制作镜像
 
 PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境，用这个镜像创建的容器需要有以下两个功能：
 
 - 拷贝训练文件到容器内
-
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。镜像的*Dockerfile*如下：
-
-```Dockerfile
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER zjsxzong89@gmail.com
-
-COPY start.sh /root/
-COPY start_paddle.py /root/
-CMD ["bash"," -c","/root/start.sh"]
-```
-
-[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh)文件拷贝训练文件到容器内，然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
-
-`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
-
-```python
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
-```
-
-然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器运行都运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
-
-```python
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
-```
-
-在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
-
-```python
-def getIdMap(podlist):
-    '''
-    generate tainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
-```
-
-在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
-
-在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
-
-```python
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
-
-使用 `docker build` 构建镜像：
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
 
 ```bash
-docker build -t your_repo/paddle:mypaddle .
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
 ```
 
 然后将构建成功的镜像上传到镜像仓库。
 
 ```bash
-docker push  your_repo/paddle:mypaddle
+docker push  [YOUR_REPO]/paddle:mypaddle
 ```
 
-注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
+注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
 
 ### 上传训练文件
 
-本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
+本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到volume所在的共享存储（使用不同分布式存储会有不同的挂载方式，需要要先挂载这个目录，然后拷贝数据）。完成后volume中的文件内容大致如下：
 
 ```bash
 [root@paddle-kubernetes-node0 mfs]# tree -d
@@ -205,7 +87,7 @@ spec:
           path: /home/work/mfs
       containers:
       - name: trainer
-        image: your_repo/paddle:mypaddle
+        image: [YOUR_REPO]/paddle:mypaddle
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -289,8 +171,8 @@ I1116 09:10:17.123121    50 Util.cpp:155] commandline:
     --ports_num=2 --comment=paddle_process_by_paddle
     --pservers=192.168.129.66,192.168.223.143,192.168.129.71
     --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
     --local=0 --trainer_id=0
     --save_dir=/home/jobpath/paddle-cluster-job/output
 I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
@@ -310,3 +192,90 @@ I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:
 I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
 I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
 ```
+
+
+## 一些细节的补充
+
+### 使用环境变量
+
+使用容器方式运行训练任务的Kubernetes Job，通常会使用环境变量配置Job的配置信息`start_paddle.py`提供了一个启动脚本，将环境变量转换成paddle的命令行参数：
+```
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Pod间通信
+`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器运行都运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+* *注意*: `getPodList()`会获取当前namespace下的所有pod，如果已经有pod运行，可能会导致出错。这种集群节点管理方式会在将来使用[statfulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。
+
+在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+### 启动任务
+
+在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
index 50eed4c6573a18..bd34f46c9b0ada 100644
Binary files a/doc/howto/usage/k8s/src/add_security_group.png and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
index f4d448d1518e11..e5f1526033d1da 100644
Binary files a/doc/howto/usage/k8s/src/create_efs.png and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/job.yaml b/doc/howto/usage/k8s/src/job.yaml
deleted file mode 100644
index 488aad0bede4f9..00000000000000
--- a/doc/howto/usage/k8s/src/job.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: jobpath
-        hostPath: 
-          path: /home/work/paddle_output              
-      containers:
-      - name: trainer
-        image: registry.baidu.com/public/paddle:mypaddle
-        command: ["bin/bash",  "-c", "/root/start.sh"]        
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath     
-        - name: JOB_NAMESPACE
-          value: default         
-        - name: TRAIN_CONFIG_DIR
-          value: recommendation
-        - name: CONF_PADDLE_NIC
-          value: eth0  
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"     
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"  
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"                                                               
-        volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath       
-      restartPolicy: Never
-    
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
index a8c64550b1fa7f..2183a232ad402b 100644
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and b/doc/howto/usage/k8s/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
new file mode 100644
index 00000000000000..6d3a12ae393aa5
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
@@ -0,0 +1,7 @@
+FROM alpine
+
+RUN apk update && apk upgrade && apk add coreutils
+ADD quick_start /quick_start
+ADD get_data.sh /bin/
+RUN chmod +x /bin/get_data.sh
+ENTRYPOINT ["/bin/get_data.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/k8s/src/k8s_data/README.md
new file mode 100644
index 00000000000000..83cef7affd0ac4
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
@@ -0,0 +1,6 @@
+To build PaddlePaddle data preparation image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following commands:
+
+```
+cp -r ../../../../../../demo/quick_start .
+docker build . -t prepare-data-image-name
+```
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
new file mode 100755
index 00000000000000..d187ba5ac8d03f
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+out_dir=$OUT_DIR
+split_count=$SPLIT_COUNT
+
+set -e
+
+mkdir -p $out_dir
+cp -r /quick_start $out_dir/
+
+mkdir -p $out_dir/0/data
+cd $out_dir/0/data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
+tar zxvf preprocessed_data.tar.gz
+rm preprocessed_data.tar.gz
+
+split -d --number=l/$split_count -a 5 train.txt train.
+mv train.00000 train.txt
+
+cd $out_dir
+end=$(expr $split_count - 1)
+for i in $(seq 1 $end); do
+    mkdir -p $i/data
+    cp -r 0/data/* $i/data
+    mv $i/data/train.`printf %05d $i` $i/data/train.txt
+done;
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
new file mode 100644
index 00000000000000..c0fca1f9a94592
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
@@ -0,0 +1,6 @@
+FROM paddledev/paddle:cpu-latest
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+RUN chmod +x /root/start.sh
+CMD ["bash"," -c","/root/start.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/k8s/src/k8s_train/README.md
new file mode 100644
index 00000000000000..96bf65497ffa23
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
@@ -0,0 +1,5 @@
+To build PaddlePaddle training image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following command:
+
+```
+docker build . -t train-image-name
+```
diff --git a/doc/howto/usage/k8s/src/start.sh b/doc/howto/usage/k8s/src/k8s_train/start.sh
similarity index 55%
rename from doc/howto/usage/k8s/src/start.sh
rename to doc/howto/usage/k8s/src/k8s_train/start.sh
index b3a1334174a20b..12dfe1e6386885 100755
--- a/doc/howto/usage/k8s/src/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
@@ -1,19 +1,19 @@
 #!/bin/sh
+
 set -eu
 
 jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
 cd /root
-cp -rf $jobconfig .
-cd $TRAIN_CONFIG_DIR
-
+cp -rf $jobconfig/* .
 
 python /root/start_paddle.py \
   --dot_period=10 \
-  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+  --ports_num=$CONF_PADDLE_PORTS_NUM \
+  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
   --log_period=50 \
   --num_passes=10 \
-  --trainer_count=4 \
+  --trainer_count=$TRAINER_COUNT \
   --saving_period=1 \
   --local=0 \
-  --config=./trainer_config.py \
+  --config=trainer_config.lr.py \
   --use_gpu=0
diff --git a/doc/howto/usage/k8s/src/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
similarity index 84%
rename from doc/howto/usage/k8s/src/start_paddle.py
rename to doc/howto/usage/k8s/src/k8s_train/start_paddle.py
index df00d82919faa2..f1a770ccb54fbd 100755
--- a/doc/howto/usage/k8s/src/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
@@ -23,7 +23,6 @@
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
 JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_DATA = JOB_PATH + "/data"
 JOB_PATH_OUTPUT = JOB_PATH + "/output"
 JOBNAME = os.getenv("JOB_NAME")
 NAMESPACE = os.getenv("JOB_NAMESPACE")
@@ -33,6 +32,8 @@
 PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
 PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
 
+tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
+
 
 def refine_unknown_args(cmd_args):
     '''
@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
     for pod in podlist["items"]:
         if pod["status"]["phase"] == "Running":
             running += 1
+    print "waiting for pods running, require:", require, "running:", running
     if require == running:
         return True
     return False
@@ -79,8 +81,17 @@ def getPodList():
 
     pod = API + NAMESPACE + "/pods?"
     job = JOBNAME
-    return requests.get(apiserver + pod + JOBSELECTOR + job,
-                        verify=False).json()
+    if os.path.isfile(tokenpath):
+        tokenfile = open(tokenpath, mode='r')
+        token = tokenfile.read()
+        Bearer = "Bearer " + token
+        headers = {"Authorization": Bearer}
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            headers=headers,
+                            verify=False).json()
+    else:
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            verify=False).json()
 
 
 def getIdMap(podlist):
@@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None):
     if not os.path.exists(JOB_PATH_OUTPUT):
         os.makedirs(JOB_PATH_OUTPUT)
     os.mkdir(logDir)
-    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
-        "/" + str(trainerId) + " ./data"
+    copyCommand = 'cp -rf ' + JOB_PATH + \
+        "/" + str(trainerId) + "/data/*" + " ./data/"
     os.system(copyCommand)
     startPserver = 'nohup paddle pserver' + \
         " --port=" + str(PADDLE_PORT) + \
@@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None):
     print startPserver
     os.system(startPserver)
     # wait until pservers completely start
-    time.sleep(10)
-    startTrainer = program + args + " > " + \
-        logDir + "/train.log 2>&1 < /dev/null"
+    time.sleep(20)
+    startTrainer = program + args + " 2>&1 | tee " + \
+        logDir + "/train.log"
     print startTrainer
     os.system(startTrainer)
 
@@ -152,7 +163,7 @@ def startPaddle(idMap={}, train_args_dict=None):
     podlist = getPodList()
     # need to wait until all pods are running
     while not isPodAllRunning(podlist):
-        time.sleep(10)
+        time.sleep(20)
         podlist = getPodList()
     idMap = getIdMap(podlist)
     startPaddle(idMap, train_args_dict)
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/k8s/src/pserver_and_trainer.png
new file mode 100644
index 00000000000000..f41fe489205903
Binary files /dev/null and b/doc/howto/usage/k8s/src/pserver_and_trainer.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/k8s/src/route53_create_recordset.png
new file mode 100644
index 00000000000000..34e476c7beac30
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_recordset.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/k8s/src/route53_create_zone.png
new file mode 100644
index 00000000000000..25b7ddb831c5cb
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_zone.png differ
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/k8s/src/worker_security_group.png
new file mode 100644
index 00000000000000..57eb0265a34ad4
Binary files /dev/null and b/doc/howto/usage/k8s/src/worker_security_group.png differ
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 41beed38a87601..a3f4bfffc9f074 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
   return args;
 }
 
+Arguments* Arguments::createByPaddleArgument(const void* ptr) {
+  auto p = (paddle::Argument*)(ptr);
+  auto args = new Arguments();
+  args->m->outputs.push_back(*p);
+  return args;
+}
+
 Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return Matrix::createByPaddleMatrixPtr(&a.value);
diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp
index c30e09876397e3..681e3a38091233 100644
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
@@ -27,3 +27,18 @@ std::string Evaluator::toString() {
   m->rawPtr->printStats(sout);
   return sout.str();
 }
+
+std::vector<std::string> Evaluator::getNames() const {
+  std::vector<std::string> retv;
+  m->rawPtr->getNames(&retv);
+  return retv;
+}
+
+double Evaluator::getValue(const std::string name) const {
+  paddle::Error err;
+  double v = m->rawPtr->getValue(name, &err);
+  if (err) {
+    throw std::runtime_error(err.msg());
+  }
+  return v;
+}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index 66115f8293b905..538ca2999f8f05 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -144,12 +144,12 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
 
 void GradientMachine::randParameters() { m->machine->randParameters(); }
 
-Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
+Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
     throw(UnsupportError) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
+  auto nn = m->machine;
   if (nn) {
-    auto mat = nn->getLayerOutput(layerName);
-    return Matrix::createByPaddleMatrixPtr(&mat);
+    auto arg = nn->getLayerOutput(layerName);
+    return Arguments::createByPaddleArgument(&arg);
   } else {
     throw UnsupportError();
   }
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index f5af8b0035b44d..d99e9a4ad48ea4 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -47,6 +47,9 @@ void setUseGpu(bool useGpu);
 /// Return true if this py_paddle is compiled in GPU Version
 bool isGpuVersion();
 
+/// Return FLAGS_trainer_count
+int getTrainerCount();
+
 /// The Error of IO Operation. Such as file not found, etc.
 class IOError {};
 
@@ -454,6 +457,7 @@ class Arguments {
 
 private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
+  static Arguments* createByPaddleArgument(const void* ptr);
   void* getInternalArgumentsPtr() const;
 
 private:
@@ -769,7 +773,7 @@ class GradientMachine {
 
   void randParameters();
 
-  Matrix* getLayerOutput(const std::string& layerName) const
+  Arguments* getLayerOutput(const std::string& layerName) const
       throw(UnsupportError);
 
   /**
@@ -900,6 +904,10 @@ class Evaluator {
    */
   std::string toString();
 
+  std::vector<std::string> getNames() const;
+
+  double getValue(const std::string name) const;
+
 private:
   EvaluatorPrivate* m;
 
@@ -952,7 +960,7 @@ class Trainer {
 
   Arguments* getForwardOutput();
 
-  Matrix* getLayerOutput(const std::string& layerName);
+  Arguments* getLayerOutput(const std::string& layerName) const;
 };
 
 /// the N-Best results generated from one input sequence.
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index d83dc380beeec3..84e4ca054abb01 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
 void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
 void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
 
-Matrix* Trainer::getLayerOutput(const std::string& layerName) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
-      this->m->getGradientMachine());
+Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
+  auto nn = this->m->getGradientMachine();
   CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto m = nn->getLayerOutput(layerName);
-  return Matrix::createByPaddleMatrixPtr(&m);
+  auto arg = nn->getLayerOutput(layerName);
+  return Arguments::createByPaddleArgument(&arg);
 }
 
 void Trainer::forwardOneBatch(size_t batchSize) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 54d67aa62f4d87..d369df5d4e04b4 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -54,5 +54,7 @@ bool isGpuVersion() {
 #endif
 }
 
+int getTrainerCount() { return FLAGS_trainer_count; }
+
 static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
               "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index 37666bdccc9aed..f08fbf3ccdf5d7 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -68,7 +68,7 @@ def test_createDenseMat(self):
 
     def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
+        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
         self.assertEqual((int(m.getHeight()), int(m.getWidth())),
                          numpy_mat.shape)
 
diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py
index a90d15c272a3a2..7061a4c43bf011 100644
--- a/paddle/api/test/testTrain.py
+++ b/paddle/api/test/testTrain.py
@@ -89,9 +89,14 @@ def update_callback(param):
             except Exception as e:
                 print e
 
+        ev = m.makeEvaluator()
+        ev.start()
         m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
                           update_callback)
-
+        m.eval(ev)
+        ev.finish()
+        for name in ev.getNames():
+            print name, ev.getValue(name)
         for optimizer in optimizers:
             optimizer.finishBatch()
 
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 1ab095c1d3d0d2..6339cf8542607b 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -43,7 +43,7 @@ def test_create(self):
 
     def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
+        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
         vec[4] = 832
         for i in xrange(len(iv)):
@@ -106,7 +106,7 @@ def testCreate(self):
 
     def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
+        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
         assert isinstance(vec, swig_paddle.Vector)
         numpy_arr[0] = 0.1
         for n, v in zip(numpy_arr, vec):
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index abd5eb3a0cf338..eb454c59c1e58c 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d,
                                         const int* index,
                                         int numSequence);
 
-/**
- * @brief   Matrix classification error.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input vector (M x 1).
- * @param[out]  C_d     output vector (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN);
-
 /**
  * @brief   Matrix cross entropy.
  *
@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
                                         int width,
                                         int height,
                                         int partial_sum);
-/**
- * @brief cos sim forward
- *
- * @param[out]    output         output data
- * @param[in]     input1         input1 data(matrix)
- * @param[in]     input2         input2 data(matrix or vector)
- * @param[in]     width          matrix width
- * @param[in]     input1_height  input1_height
- * @param[in]     input2_height  input2_height
- * @param[in]     scale          scale factor
- */
-extern void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale);
-/**
- * @brief cos sim derivate
- *
- * @param[in]     grad             output grad
- * @param[in]     output           output data
- * @param[in]     prevOutX         input1 data
- * @param[in]     prevOutY         input2 data
- * @param[out]    prevGradX        input1 grad
- * @param[out]    prevGradY        input2 grad
- * @param[in]     width            matrix width
- * @param[in]     input1_height    input1 height
- * @param[in]     input2_height    input2 height
- * @param[in]     scale            scale factor
- */
-extern void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale);
 
 /**
  * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimN,
                                           real scale);
 
+/**
+ * @brief  Matrix rotation in 90 degrees
+ *
+ * @param[in]   mat       input matrix (M x N).
+ * @param[out]  matRot    output matrix (N x M).
+ * @param[in]   dimM      input matrix height.
+ * @param[in]   dimN      input matrix width.
+ * @param[in]   clockWise rotation direction
+ */
+extern void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index 77949ed295a6ea..79ae0d0e741de0 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal,
                                    int beamSize,
                                    int numSamples);
 
-#endif /* HL_TOP_K_H_ */
+/**
+ * @brief   Matrix classification error.
+ *
+ * @param[out]  topVal         top k element.
+ * @param[in]   ldv            leading dimension of topVal.
+ * @param[out]  topIds         top k index.
+ * @param[in]   src            input value.
+ * @param[in]   lds            leading dimension of src.
+ * @param[in]   dim            width of input value.
+ * @param[in]   topkSize       size of top k element.
+ * @param[in]   numSamples     height of input value.
+ * @param[in]   label          ground truth label.
+ * @param[out]  recResult      top-k classification error.
+ *
+ */
+extern void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult);
+
+#endif  // HL_TOP_K_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 0b669f6735cb97..127cb7e27983e8 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d,
 inline void hl_matrix_softmax_derivative(
     real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
 
-inline void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+inline void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult) {}
 
 inline void hl_matrix_cross_entropy(
     real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
                                         int height,
                                         int partial_sum) {}
 
-inline void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale) {}
-
-inline void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale) {}
-
 inline void hl_matrix_add_shared_bias(real* A_d,
                                       real* B_d,
                                       const int channel,
@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimM,
                                           const int dimN,
                                           real scale) {}
+
+inline void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
+
 #endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 2b4c6f7c39cff7..9bcc7fb7de44b2 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d,
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }
 
-template<int blockSize>
-__global__ void KeMatrixClassificationError(real* in_A,
-                                            int* in_B,
-                                            real* out_C,
-                                            int dimN) {
-  __shared__ real max_s[blockSize];
-  __shared__ int max_l[blockSize];
-  const int tid = threadIdx.x;
-  const int rowId = blockIdx.x;
-
-  max_s[tid] = -1e30f;
-  in_A += rowId * dimN;
-  real tmp;
-  for (int colId = tid; colId < dimN; colId += blockSize) {
-    tmp = in_A[colId];
-    if (max_s[tid] < tmp) {
-      max_s[tid] = tmp;
-      max_l[tid] = colId;
-    }
-  }
-  __syncthreads();
-
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      if (max_s[tid] < max_s[tid + stride]) {
-        max_s[tid] = max_s[tid + stride];
-        max_l[tid] = max_l[tid + stride];
-      }
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
-  }
-}
-
-void hl_matrix_classification_error(real* A_d,
-                                    int* B_d,
-                                    real* C_d,
-                                    int dimM,
-                                    int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  // each sample is calculated by one block
-  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
-    (A_d, B_d, C_d, dimN);
-  CHECK_SYNC("hl_matrix_classification_error");
-}
-
 __global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
                                                 real* entropy,
                                                 int* row,
@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-template<int blockSize>
-__global__ void KeCosSim(real* output,
-                         real* input1,
-                         real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hl_cossim(real* output,
-               real* input1,
-               real* input2,
-               int width,
-               int input1_height,
-               int input2_height,
-               real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim failed");
-}
-
-template<int blockSize>
-__global__ void KeCosSimDerivative(real* grad,
-                                   real* output,
-                                   real* prevOutX,
-                                   real* prevOutY,
-                                   real* prevGradX,
-                                   real* prevGradY,
-                                   int width,
-                                   int input1_height,
-                                   int input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prevOutX += ty * width;
-  prevGradX += ty * width;
-  if (input2_height > 1) {
-    prevOutY += ty * width;
-    prevGradY += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = prevOutX[index];
-    real y = prevOutY[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] +=
-        scale * grad[ty] * prevOutY[index] * reciprocal;
-      if (input2_height > 1) {
-        prevGradY[index] +=
-          scale * grad[ty] * prevOutX[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index,
-          scale * grad[ty] * prevOutX[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] += output[ty] * grad[ty] *
-        (prevOutY[index] * reciprocalXY -
-         prevOutX[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prevGradY[index] += output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-
-void hl_cossim_derivative(real* grad,
-                          real* output,
-                          real* prevOutX,
-                          real* prevOutY,
-                          real* prevGradX,
-                          real* prevGradY,
-                          int width,
-                          int input1_height,
-                          int input2_height,
-                          real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prevOutX);
-  CHECK_NOTNULL(prevOutY);
-  CHECK_NOTNULL(prevGradX);
-  CHECK_NOTNULL(prevGradY);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
-        input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim_derivate failed");
-}
-
 __global__ void KeMatrixAddSharedBias(real* A,
                                       real* B,
                                       const int channel,
@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
       (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
+
+__global__ void keMatrixRotate(real* mat, real* matRot,
+                               int dimM, int dimN, bool clockWise) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < dimM * dimN) {
+        int i = idx / dimN;
+        int j = idx % dimN;
+        if (clockWise) {
+            matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
+        } else {
+            matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
+        }
+    }
+}
+
+void hl_matrix_rotate(real *mat, real* matRot,
+                      int dimM, int dimN, bool clockWise) {
+    CHECK_NOTNULL(mat);
+    CHECK_NOTNULL(matRot);
+    const int threads = 512;
+    const int blocks = DIVUP(dimM * dimN, threads);
+    keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
+            (mat, matRot, dimM, dimN, clockWise);
+    CHECK_SYNC("hl_matrix_rotate failed");
+}
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
index f0ef0cc3c51f9e..4f0bbfcf4e3aa5 100644
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
   CHECK_SYNC("hl_sparse_matrix_top_k failed");
 }
 
+/**
+ * Each block compute one sample.
+ * In a block:
+ * 1. every thread get top maxLength value;
+ * 2. merge to shTopK, block reduce and get max value;
+ * 3. go to the second setp, until one thread's topK value is null;
+ * 4. go to the first setp, until get the topK value.
+ */
+template<int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
+                                                int * topIds,
+                                                real* src, int lds,
+                                                int dim,
+                                                int beamSize,
+                                                int* label,
+                                                real* recResult) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength]; // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+  int topkSize = beamSize;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>
+      (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>
+      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+
+  __syncthreads();
+  if (tid == 0) {
+    for (int i = 0; i < topkSize; i++) {
+        if (*--topIds == label[blockIdx.x]) {
+            recResult[blockIdx.x] = 0;
+            break;
+        }
+        recResult[blockIdx.x] = 1.0f;
+    }
+  }
+}
+
+void hl_matrix_classification_error(real* topVal, int ldv,
+                                   int* topIds,
+                                   real* src, int lds,
+                                   int dim,
+                                   int topkSize,
+                                   int numSamples,
+                                   int* label,
+                                   real* recResult) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+
+  if (topkSize > dim) topkSize = dim;
+
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopKClassificationError<5, 256>
+  <<< grid, threads, 0, STREAM_DEFAULT >>>
+  (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+
+  CHECK_SYNC("hl_matrix_top_k classification error failed");
+}
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 55b940ca67acce..f57efb2b46797c 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -54,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
 #define WARPCTC_GET_VERSION dynload::get_warpctc_version
 #define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
 
+static int g_warpctcVersion = -1;
 #ifndef PADDLE_TYPE_DOUBLE
 #define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
 #define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
 #else
-#define WARPCTC_LOG_FATAL                                \
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
-             << "] Error: not support double precision."
-#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
-#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+hl_warpctc_status_t fatal(...) {
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
+             << "] Error: not support double precision.";
+  // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
+  // type value
+  return CTC_STATUS_EXECUTION_FAILED;
+}
+#define WARPCTC_COMPUTE_LOSS fatal
+#define WARPCTC_GET_WORKSPACE_SIZE fatal
 #endif
 
 /**
  * Check build-in warp-ctc function using glog and it also
  * support << operator for more details error info.
  */
-static int g_warpctcVersion = -1;
 #define CHECK_WARPCTC(warpctcStat)                \
   CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
       << "warp-ctc [version " << g_warpctcVersion \
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index fde48a73b61c31..2b70036e3ff7de 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -20,23 +20,33 @@ limitations under the License. */
 namespace paddle {
 
 const SequenceArg& BufferArg::sequence() const {
-  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
   return dynamic_cast<const SequenceArg&>(*this);
 }
 
 const SparseMatrixArg& BufferArg::sparse() const {
-  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
   return dynamic_cast<const SparseMatrixArg&>(*this);
 }
 
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 }  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 12352ba29e3392..0dc7792f646457 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -23,23 +23,16 @@ limitations under the License. */
 namespace paddle {
 
 enum BufferType {
-  TENSOR_NORMAL = 0,
-  TENSOR_SEQUENCE_ID = 1,
-  TENSOR_SEQUENCE_DATA = 2,
-  TENSOR_SPARSE = 3
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,
+  TENSOR_SEQUENCE_ID = 2,
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
 };
 
-enum SparseDataType {
-  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
-  SPARSE_FLOAT_VALUE = 1
-};
-
-enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
-
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
 /**
  * \brief BufferArg used as the argument type of Function.
@@ -50,6 +43,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * 3. SequenceArg for a Buffer of sequence data.
  * 4. SparseMatrixArg for a Buffer of sparse matrix.
  *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
  * There is an ArgType property for the BufferArg used as Function Output.
  * Whether the result of the Function calculation is assigned to the
  * output Buffer or added to the output Buffer is determined by the
@@ -71,14 +69,24 @@ class BufferArg {
   ArgType getArgType() const { return argType_; }
 
 public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
             ArgType argType = UNSPECIFIED)
-      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
 
-  BufferArg(void* buf, ValueType valueType)
-      : buf_(buf), valueType_(valueType) {}
+  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
 
   BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
       : buf_(
@@ -86,6 +94,7 @@ class BufferArg {
         valueType_(DataType<real>::value),
         shape_(2),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
   }
@@ -98,6 +107,7 @@ class BufferArg {
         valueType_(DataType<real>::value),
         shape_(shape),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
@@ -107,6 +117,7 @@ class BufferArg {
         valueType_(DataType<real>::value),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -116,6 +127,7 @@ class BufferArg {
         valueType_(VALUE_TYPE_INT32),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -150,6 +162,9 @@ class BufferArg {
   ValueType valueType() const { return valueType_; }
   BufferType bufferType() const { return bufferType_; }
   const TensorShape& shape() const { return shape_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
+  virtual size_t numElements() const { return shape_.getElements(); }
 
   const SequenceArg& sequence() const;
   const SparseMatrixArg& sparse() const;
@@ -158,8 +173,9 @@ class BufferArg {
   void* buf_;
   ValueType valueType_;
   TensorShape shape_;
-  BufferType bufferType_;
-  ArgType argType_ = UNSPECIFIED;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
+  // TODO(tianbing), add deviceType_
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
@@ -170,15 +186,25 @@ class BufferArg {
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    CHECK_GE(shape_[0], 1UL);
+    numSeqs_ = shape_[0] - 1;
+  }
+
   SequenceIdArg(void* buf,
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
       : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
-    CHECK_EQ(shape_.ndims(), (size_t)1);
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
     numSeqs_ = shape_[0] - 1;
   }
 
   SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
     numSeqs_ = shape_[0] - 1;
   }
 
@@ -190,26 +216,44 @@ class SequenceIdArg : public BufferArg {
   size_t numSeqs_;
 };
 
-// sequence data
+// sequences data
+// For mini-batch calculate,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent sequences that contain multiple
+// unequal lengths.
 class SequenceArg : public BufferArg {
 public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        startPositions_(TensorShape({shape[0]})) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
               const SequenceIdArg& startPositions,
               ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {}
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   SequenceArg(const Matrix& matrix,
               const IVector& vector,
               ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {}
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   ~SequenceArg() {}
 
   void* getIdBuf() const { return startPositions_.data(); }
   size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
 private:
   SequenceIdArg startPositions_;
@@ -226,30 +270,75 @@ class SparseMatrixArg : public BufferArg {
                   const BufferArg& row,
                   const BufferArg& col,
                   size_t nnz,
-                  SparseDataFormat format,
-                  SparseDataType type,
+                  SparseFormat format,
+                  SparseValueType type,
                   ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
-        format_(format),
-        type_(type) {
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), (size_t)2);
-    CHECK_EQ(row_.shape().ndims(), (size_t)1);
-    CHECK_EQ(col_.shape().ndims(), (size_t)1);
-    if (format == SPARSE_CSR_FORMAT) {
+    CHECK_EQ(shape_.ndims(), 2UL);
+    CHECK_EQ(row_.shape().ndims(), 1UL);
+    CHECK_EQ(col_.shape().ndims(), 1UL);
+    if (format_ == T_SPARSE_CSR) {
       CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format == SPARSE_CSC_FORMAT) {
+    } else if (format_ == T_SPARSE_CSC) {
       CHECK_EQ(nnz, row.shape()[0]);
     }
   }
 
+  SparseMatrixArg(ValueType valueType,
+                  const TensorShape& shape,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+
+    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
+    row_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
+    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
+    col_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
+  }
+
   SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
 
   SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
 
+  template <DeviceType DType>
+  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(2UL, shape_.ndims());
+    return typename Tensor<real, DType>::SparseMatrix(
+        reinterpret_cast<real*>(buf_),
+        reinterpret_cast<int*>(row_.data()),
+        reinterpret_cast<int*>(col_.data()),
+        shape_[0],
+        shape_[1],
+        nnz_,
+        static_cast<SparseValueType>(type_),
+        static_cast<SparseFormat>(format_),
+        false);
+  }
+
   ~SparseMatrixArg() {}
 
   void* getRowBuf() const { return row_.data(); }
@@ -258,6 +347,8 @@ class SparseMatrixArg : public BufferArg {
 
   size_t nnz() const { return nnz_; }
 
+  size_t numElements() const override { return nnz_; }
+
   SparseDataFormat dataFormat() const { return format_; }
 
   SparseDataType dataType() const { return type_; }
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index b345597435c991..1744f377808f13 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "BufferArg.h"
 #include <gtest/gtest.h>
-#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
-#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
   EXPECT_EQ(buffer.numSeqs(), 9);
 }
 
-TEST(BufferTest, asArgument) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
-
-  // prepare arguments
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
-
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
-  };
-
-  // call function
-  function(argments);
-}
-
 }  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 75a2acc55ec3d3..1522510e8bb981 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -19,12 +19,15 @@ if(WITH_TESTING)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(CrossMapNormalOpTest)
     add_simple_unittest(TensorShapeTest)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
     add_simple_unittest(FunctionTest)
-    # add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(PadOpTest)
+    add_simple_unittest(MulOpTest)
+    add_simple_unittest(CosSimOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index cb448562ebb370..b87750b74247bd 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -17,7 +17,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 
 namespace paddle {
-
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
 template <>
 void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
                                                const CpuMatrix& input_mat,
@@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 }
 
 /**
- * \param inputs[0] input value.
- * \param inputs[1] input weight.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Paddle Function for Context Projection Forward.
+ * Calculate the output layer value sequence after context projection.
+ *
+ * What is Context Projection for a sequence?
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_lenth is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -85,28 +108,35 @@ class ContextProjectionForwardFunc : public FunctionBase {
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(1UL == inputs.size() || 2UL == inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
 
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    if (2UL == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+      /// dim of input == dim of weight
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
+
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    auto out_mat = out_seq.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
+    const auto w_mat =
+        (2UL == inputs.size() && inputs[1].data())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
 
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    auto in_mat = inputs[0].matrix<Device>();
-    auto w_mat = !inputs[1].data()
-                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                     : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
                                      w_mat,
@@ -122,8 +152,12 @@ class ContextProjectionForwardFunc : public FunctionBase {
   size_t begin_pad_;
 };
 
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
                                                 CpuMatrix& in_grad_mat,
                                                 CpuMatrix& w_grad_mat,
                                                 const CpuIVector& seq_vec,
@@ -146,7 +180,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
@@ -157,8 +192,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat =
-              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(
               begin_pad + context_start + j - pad_size, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
@@ -169,17 +204,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
       if (end <= begin) continue;
       if (!in_grad_mat) continue;
       MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
       src->addAtOffset(*dst, j * input_dim);
     }
   }
 }
 
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] weight grad.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -193,32 +233,40 @@ class ContextProjectionBackwardFunc : public FunctionBase {
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
-
-    CHECK(outputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    CHECK_EQ(1UL, inputs.size());
+    CHECK(1UL == outputs.size() || 2UL == outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
 
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
+    /// input and output grad has the same batch_size
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
 
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    if (2UL == outputs.size()) {
+      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
+      /// dim of input grad == dim of weight
+      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
+      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    }
 
-    auto out_grad_mat = outputs[0].matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
     auto in_grad_mat =
-        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[0].matrix<Device>();
-    auto w_grad_mat = !inputs[1].data()
-                          ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
+    auto w_grad_mat =
+        (2UL == outputs.size() && outputs[1].data())
+            ? outputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+
     ContextProjectionBackward<Device>(out_grad_mat,
                                       in_grad_mat,
                                       w_grad_mat,
@@ -238,11 +286,16 @@ class ContextProjectionBackwardFunc : public FunctionBase {
   size_t total_pad_;
 };
 
-#if 0
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Data Function
+ * Update input layer grad
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start positon sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -252,32 +305,30 @@ class ContextProjectionBackwardDataFunc : public FunctionBase {
     context_start_ = config.get<int>("context_start");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    /// output layer grad dim == input layer grad dim * context_length_
+    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
 
-    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
-                                          in_grad_mat.get(),
-                                          seq_vec,
-                                          context_length_,
-                                          context_start_);
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
   }
 
 private:
@@ -286,9 +337,14 @@ class ContextProjectionBackwardDataFunc : public FunctionBase {
 };
 
 /**
- * \param inputs[0] weight grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vecotr             start positon sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -300,28 +356,25 @@ class ContextProjectionBackwardWeightFunc : public FunctionBase {
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
 
-    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
-                                            w_grad_mat.get(),
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
                                             seq_vec,
                                             context_length_,
                                             context_start_,
@@ -335,7 +388,6 @@ class ContextProjectionBackwardWeightFunc : public FunctionBase {
   size_t begin_pad_;
   size_t total_pad_;
 };
-#endif
 
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     CPU,
@@ -350,7 +402,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     GPU,
                     ContextProjectionBackwardFunc);
-#if 0
 REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
                     GPU,
                     ContextProjectionBackwardDataFunc);
@@ -358,5 +409,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
                     GPU,
                     ContextProjectionBackwardWeightFunc);
 #endif
-#endif
 }  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index a558df5e072f2f..6f7d936379a537 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include "Function.h"
 
 namespace paddle {
@@ -21,14 +20,14 @@ namespace paddle {
 /**
  * \brief   Context Projection Forward.
  *
- * \param[out]  outputs           output data.
- * \param[in]   input             input data.
- * \param[in]   weight            input weight.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in/out]  outputs           output data.
+ * \param[in]      input             input data.
+ * \param[in]      weight            input weight.
+ * \param[in]      sequence          input data.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         begining pad position.
+ * \param[in]      is_padding        whether padding 0 or not.
  *
  */
 template <DeviceType DType>
@@ -56,7 +55,7 @@ void ContextProjectionForward(
  */
 template <DeviceType DType>
 void ContextProjectionBackward(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
@@ -68,7 +67,7 @@ void ContextProjectionBackward(
 
 template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     const typename Tensor<int, DType>::Vector& sequence,
     size_t context_length,
@@ -76,7 +75,7 @@ void ContextProjectionBackwardData(
 
 template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
     size_t context_length,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 6a4a01a6510416..1a5b4042402df3 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
                                 begin_pad);
 }
 
-__global__ void KeContextProjectionBackwardData(real* out_grad,
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
                                                 const int* sequence,
                                                 real* in_grad,
-                                                int input_dim,
+                                                size_t input_dim,
                                                 int context_length,
                                                 int context_start) {
   int idx = threadIdx.x;
@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
-  out_grad += seq_start * input_dim * context_length;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
   in_grad += seq_start * input_dim;
   for (int k = 0; k <= input_dim / block_size; k++) {
     if (idx < input_dim) {
@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          out_grad + outy * input_dim * context_length + outx * input_dim;
+          out + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           value += output_r[idx];
           if (j - outy == outx) break;
@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
  * @param[in]   context_start    context start.
  *
  */
-void hl_context_projection_backward_data(real* out_grad,
+void hl_context_projection_backward_data(const real* out_grad,
                                          const int* sequence,
                                          real* input_grad,
                                          size_t num_sequences,
@@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
                                                     size_t context_length,
@@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
 }
 
 template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,
                                                   int num_sequences,
@@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
       int seq_end = sequence[seqId+1];
-      output_r = out_grad + seq_start * w_dim * context_length;
+      output_r = const_cast<real*>(out_grad)
+                    + seq_start * w_dim * context_length;
 
       if (context_start < 0) {
         if (padId + context_start < 0) {
@@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
  * beginning.
  *
  */
-void hl_context_projection_backward_weight(real* out_grad,
+void hl_context_projection_backward_weight(const real* out_grad,
                                            const int* sequence,
                                            real* w_grad,
                                            size_t num_sequences,
@@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad,
 
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix& out_grad,
+        const GpuMatrix& out_grad,
         GpuMatrix& w_grad,
         const GpuIVector& seq_vec,
         size_t context_length,
@@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 GpuMatrix& in_grad,
                                                 GpuMatrix& w_grad,
                                                 const GpuIVector& sequence,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 6223d2fd23ac3b..0f5d6a848d406d 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,52 +28,26 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare compare("ContextProjectionForward",
-                          FuncConfig()
-                              .set("context_length", context_length)
-                              .set("context_start", context_start)
-                              .set("begin_pad", std::max(0, -context_start)));
-
-  CpuMatrix cpu_in(batch_size, input_dim);
-  cpu_in.randomizeUniform();
-  GpuMatrix gpu_in(batch_size, input_dim);
-  gpu_in.copyFrom(cpu_in);
-  auto cpu_weight =
-      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
-  auto gpu_weight =
-      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
-  if (is_padding) {
-    cpu_weight->randomizeUniform();
-    gpu_weight->copyFrom(*cpu_weight);
+  FunctionCompare test("ContextProjectionForward",
+                       FuncConfig()
+                           .set("context_length", context_length)
+                           .set("context_start", context_start)
+                           .set("begin_pad", std::max(0, -context_start)));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
+  if (is_padding) {  // weight
+    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
   }
-  IVectorPtr cpu_seq;
-  generateSequenceStartPositions(batch_size, cpu_seq);
-  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
-  gpu_seq->copyFrom(*cpu_seq);
-
-  CpuMatrix cpu_out(batch_size, input_dim * context_length);
-  GpuMatrix gpu_out(batch_size, input_dim * context_length);
-  cpu_out.randomizeUniform();
-  gpu_out.copyFrom(cpu_out);
-
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT,
+                  TensorShape{batch_size, input_dim * context_length}),
+      ADD_TO);
 
-  autotest::TensorCheckEqual(cpu_out, gpu_out);
+  // run Function
+  test.run();
 }
 
 void testMatrixProjectionBackward(int context_start,
@@ -85,65 +59,31 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare compare("ContextProjectionBackward",
-                          FuncConfig()
-                              .set("context_length", context_length)
-                              .set("context_start", context_start)
-                              .set("begin_pad", std::max(0, -context_start))
-                              .set("is_padding", is_padding)
-                              .set("total_pad", pad));
-
-  CpuMatrix cpu_in_grad(batch_size, input_dim);
-  cpu_in_grad.randomizeUniform();
-  GpuMatrix gpu_in_grad(batch_size, input_dim);
-  gpu_in_grad.copyFrom(cpu_in_grad);
-
-  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
-  cpu_out_grad.randomizeUniform();
-  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
-  gpu_out_grad.copyFrom(cpu_out_grad);
-
-  IVectorPtr cpu_seq;
-  generateSequenceStartPositions(batch_size, cpu_seq);
-  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
-  gpu_seq->copyFrom(*cpu_seq);
-
-  auto cpu_w_grad =
-      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
-  auto gpu_w_grad =
-      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
-  if (is_padding) {
-    cpu_w_grad->randomizeUniform();
-    gpu_w_grad->copyFrom(*cpu_w_grad);
+  FunctionCompare test("ContextProjectionBackward",
+                       FuncConfig()
+                           .set("context_length", context_length)
+                           .set("context_start", context_start)
+                           .set("begin_pad", std::max(0, -context_start))
+                           .set("is_padding", is_padding)
+                           .set("total_pad", pad));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(SequenceArg(
+      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
+      ADD_TO);
+  if (is_padding) {  // weight
+    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
+                    ADD_TO);
   }
 
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
-
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
-
-  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
-  if (is_padding) {
-    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
-  }
+  // run Function
+  test.run();
 }
 
-TEST(ContextProjection, projection) {
+TEST(ContextProjection, Projection) {
   for (auto context_start : {-5, -3, -1, 0, 3}) {
     for (auto context_length : {1, 2, 5, 7}) {
       for (auto trainable_padding : {false, true}) {
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
new file mode 100644
index 00000000000000..7ece7b2dfedaf4
--- /dev/null
+++ b/paddle/function/CosSimOp.cpp
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+/**
+ * Cosine Similarity for CpuMatrix
+ *
+ * \param out_mat, output value, size: nSamples * 1.
+ * \param in1_mat, input value 1, size: nSamples * dim.
+ * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale, default 1.0
+ *
+ */
+template <>
+void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                    const CpuMatrix& in1_mat,
+                                    const CpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  size_t num_samples = out_mat.getHeight();
+  size_t dim = in1_mat.getWidth();
+  /// column vector [nSamples, 1]
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+
+  /// in2 might only have one row or full rows
+  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
+  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
+  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += x[j] * x[j];
+      square_sum_y += y[j] * y[j];
+      xy += x[j] * y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+  }
+}
+
+/**
+ * Cosine Similarity
+ * for each row i,
+ *   out[i] = scale * cos(input1[i], input2[i])
+ *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
+ * when input2 only has one row, then for each row i,
+ *   out[i] = cos(input1[i], input2[0])
+ *
+ * \param inputs[0] input matrix 1, size: nSamples * dim.
+ * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output matrix, size : nSamples * 1.
+ */
+
+template <DeviceType Device>
+class CosSimForwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 2UL);
+    CHECK_EQ(outputs.size(), 1UL);
+
+    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], 1UL);
+
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    auto out_mat = outputs[0].matrix<Device>();
+    const auto in1_mat = inputs[0].matrix<Device>();
+    const auto in2_mat = inputs[1].matrix<Device>();
+
+    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
+  }
+
+private:
+  real scale_;
+};
+
+/**
+ * Cosine Similarity Derivative for CpuMatrix
+ *
+ * \param in1_grad  forward input grad 1, size: nSamples * dim.
+ * \param in2_grad  forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ *
+ * \param out_grad  backward loss output grad, size : nSamples * 1.
+ * \param out_val   forward output value, size: nSamples * 1.
+ * \param in1_val   forward input value 1, size: nSamples * dim.
+ * \param in2_val   forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale,    default 1.0
+ */
+template <>
+void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
+                                     const CpuMatrix& out_val,
+                                     const CpuMatrix& in1_val,
+                                     const CpuMatrix& in2_val,
+                                     CpuMatrix& in1_grad,
+                                     CpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
+
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+
+  size_t num_samples = out_grad.getHeight();
+  size_t dim = in1_val.getWidth();
+  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
+  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
+  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
+  for (size_t i = 0; i < num_samples; ++i,
+              prev_out_x += dim,
+              prev_out_y += inc,
+              prev_grad_x += dim,
+              prev_grad_y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += prev_out_x[j] * prev_out_x[j];
+      square_sum_y += prev_out_y[j] * prev_out_y[j];
+      xy += prev_out_x[j] * prev_out_y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    if (xy == 0) {
+      real reciprocal =
+          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
+        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
+      }
+    } else {
+      real reciprocal_xy = 1.0f / xy;
+      real reciprocal_square_sum_x = 1.0f / square_sum_x;
+      real reciprocal_square_sum_y = 1.0f / square_sum_y;
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] +=
+            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
+                                prev_out_x[j] * reciprocal_square_sum_x);
+        prev_grad_y[j] +=
+            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
+                                prev_out_y[j] * reciprocal_square_sum_y);
+      }
+    }
+  }
+}
+
+/**
+ * Cosine Similarity backward Derivative
+ *
+ * \param outputs[0] forward input grad 1, size: nSamples * dim.
+ * \param outputs[1] forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ *
+ * \param inputs[0] backward loss output grad, size : nSamples * 1.
+ * \param inputs[1] forward output value, size: nSamples * 1.
+ * \param inputs[2] forward input value 1, size: nSamples * dim.
+ * \param inputs[3] forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 4UL);
+    CHECK_EQ(outputs.size(), 2UL);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(inputs[0].shape()[1], 1UL);
+    CHECK_EQ(inputs[1].shape()[1], 1UL);
+    /// nSamples of out_grad == out_val == in_val1 == in_grad1
+    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
+    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
+    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
+
+    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
+          inputs[3].data() && outputs[0].data() && outputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+
+    const auto out_grad = inputs[0].matrix<Device>();
+    const auto out_val = inputs[1].matrix<Device>();
+    const auto in1_val = inputs[2].matrix<Device>();
+    const auto in2_val = inputs[3].matrix<Device>();
+    auto in1_grad = outputs[0].matrix<Device>();
+    auto in2_grad = outputs[1].matrix<Device>();
+
+    CosSimBackward<Device>(
+        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
+  }
+
+private:
+  real scale_;
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h
new file mode 100644
index 00000000000000..be73064e6375bf
--- /dev/null
+++ b/paddle/function/CosSimOp.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Cosine Similarity Forward.
+ * for each row i,
+ * out[i] = scale * cos(in1[i], in2[i])
+ *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
+ *                  sqrt(sum_j (in1[i][j]^2) * sum_j (in2[i][j])^2)
+ *
+ * \param[out]  output            output value.
+ * \param[in]   intput1           input value.
+ * \param[in]   intput2           input value.
+ * \param[in]   scale             default 1.0.
+ *
+ */
+template <DeviceType Device>
+void CosSimForward(typename Tensor<real, Device>::Matrix& output,
+                   const typename Tensor<real, Device>::Matrix& input1,
+                   const typename Tensor<real, Device>::Matrix& input2,
+                   real scale);
+
+/**
+ * \brief   Cosine Similarity BackWard for Derivative.
+ *
+ * \param[in]       output grad           backward loss output grad.
+ * \param[in]       output val            forward-output value.
+ * \param[in]       input val1            forward input value 1.
+ * \param[in]       input val2            forward input value 2.
+ * \param[in/out]   input grad            forward input grad 1.
+ * \param[in/out]   input grad            forward input grad 2.
+ * \param[in]       scale                 default 1.0.
+ *
+ */
+template <DeviceType Device>
+void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
+                    const typename Tensor<real, Device>::Matrix& out_value,
+                    const typename Tensor<real, Device>::Matrix& in1_value,
+                    const typename Tensor<real, Device>::Matrix& in2_value,
+                    typename Tensor<real, Device>::Matrix& in1_grad,
+                    typename Tensor<real, Device>::Matrix& in2_grad,
+                    real scale);
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
new file mode 100644
index 00000000000000..c62ab39551f022
--- /dev/null
+++ b/paddle/function/CosSimOpGpu.cu
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "CosSimOp.h"
+
+namespace paddle {
+
+template<int block_size>
+__global__ void KeCosSim(real* output,
+                         const real* input1,
+                         const real* input2,
+                         int width,
+                         int input1_height,
+                         int input2_height,
+                         real scale) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  input1 += ty * width;
+  if (input2_height > 1) {
+    input2 += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = input1[index];
+    real y = input2[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
+  }
+}
+
+void hlCossim(real* output,
+              const real* input1,
+              const real* input2,
+              size_t width,
+              size_t input1_height,
+              size_t input2_height,
+              real scale) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(input1);
+  CHECK_NOTNULL(input2);
+  const int block_size = 256;
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);
+
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (output, input1, input2, width, input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossim failed");
+}
+
+template <>
+void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
+                                    const GpuMatrix& in1_mat,
+                                    const GpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
+      << "Matrix type are not GPU";
+
+  size_t dim = in1_mat.getWidth();
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
+}
+
+template<int block_size>
+__global__ void KeCosSimDerivative(const real* grad,
+                                   const real* output,
+                                   const real* prev_out_x,
+                                   const real* prev_out_y,
+                                   real* prev_grad_x,
+                                   real* prev_grad_y,
+                                   size_t width,
+                                   size_t input1_height,
+                                   size_t input2_height,
+                                   real scale) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  prev_out_x += ty * width;
+  prev_grad_x += ty * width;
+  if (input2_height > 1) {
+    prev_out_y += ty * width;
+    prev_grad_y += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = prev_out_x[index];
+    real y = prev_out_y[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (xy[0] == 0) {
+    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] +=
+        scale * grad[ty] * prev_out_y[index] * reciprocal;
+      if (input2_height > 1) {
+        prev_grad_y[index] +=
+          scale * grad[ty] * prev_out_x[index] * reciprocal;
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index,
+          scale * grad[ty] * prev_out_x[index] * reciprocal);
+      }
+    }
+  } else {
+    real reciprocalXY = 1.0 / xy[0];
+    real reciprocalSquareSumX = 1.0 / xx[0];
+    real reciprocalSquareSumY = 1.0 / yy[0];
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] += output[ty] * grad[ty] *
+        (prev_out_y[index] * reciprocalXY -
+         prev_out_x[index] * reciprocalSquareSumX);
+      if (input2_height > 1) {
+        prev_grad_y[index] += output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY);
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY));
+      }
+    }
+  }
+}
+
+void hlCossimDerivative(const real* grad,
+                        const real* output,
+                        const real* prev_out_x,
+                        const real* prev_out_y,
+                        real* prev_grad_x,
+                        real* prev_grad_y,
+                        size_t width,
+                        size_t input1_height,
+                        size_t input2_height,
+                        real scale) {
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(prev_out_x);
+  CHECK_NOTNULL(prev_out_y);
+  CHECK_NOTNULL(prev_grad_x);
+  CHECK_NOTNULL(prev_grad_y);
+  const int block_size = 256;
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
+        input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossimDerivate failed");
+}
+
+template <>
+void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                     const GpuMatrix& out_val,
+                                     const GpuMatrix& in1_val,
+                                     const GpuMatrix& in2_val,
+                                     GpuMatrix& in1_grad,
+                                     GpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
+        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+        << "Matrix types are not equally GPU";
+
+  size_t dim = in1_val.getWidth();
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+  hlCossimDerivative(grad,
+                     out,
+                     prev_out_x,
+                     prev_out_y,
+                     prev_grad_x,
+                     prev_grad_y,
+                     dim,
+                     in1_val.getHeight(),
+                     in2_val.getHeight(),
+                     scale);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
new file mode 100644
index 00000000000000..48c815f027161b
--- /dev/null
+++ b/paddle/function/CosSimOpTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testCosSimForward(size_t height_x,
+                       size_t height_y,
+                       size_t width,
+                       real scale) {
+  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  // prepare input arguments
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
+                  ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+void testCosSimBackward(size_t height_x,
+                        size_t height_y,
+                        size_t width,
+                        real scale) {
+  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  // prepare input arguments
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
+                  ADD_TO);
+  // run Function
+  test.run();
+}
+
+TEST(Matrix, cosSim) {
+  for (auto height_x : {10, 100, 1000}) {
+    for (auto height_y : {1, height_x}) {
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSimForward(height_x, height_y, width, scale);
+          testCosSimBackward(height_x, height_y, width, scale);
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 92980c503fdaaa..ef878bfbba961b 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,48 +112,114 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
 }
 
 /**
- * \brief {o_0, o_1} = calc(i_0)
+ * \brief Normalization with across maps.
  *
- * \param inputs[0] input value.
- * \param outputs[0] output value.
- * \param outputs[1] denoms.
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                 -- upper
+ *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                 -- j = lower
+ *
+ * upper is `min(C, c + N/2)`
+ * lower if `max(0, c - N/2)`
+ *
+ * Function implementation:
+ *
+ * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
+ * And the meaning of each dimension(0-3) is respectively batch size,
+ * feature maps, rows and columns.
+ *
+ * Input and Output in the above formula is for each map(i) of one image, and
+ * Input(i, x, y), Output(i, x, y) represents an element in an image.
+ *
+ * C is the number of feature maps of one image, and N is a hyper-parameters
+ * is configured when Function is initialized. The sum in the denominator
+ * is the sum of the same position in the neighboring maps.
+ *
+ * In the implementation of Function, k is equal to 1,
+ * so Function has no argument for k.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent Input
+ * \param outputs[0] represent Output
+ * \param outputs[1] represent The denominator in the formula(except beta)
+ *
+ * Note:
+ * Save output[1] is to simplify the backward calculation.
+ * TODO, if only consider the forward calculation, we can optimize to
+ * remove the output[1].
  */
 template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
+    // function arguments
     size_ = config.get<size_t>("size");
     scale_ = config.get<real>("scale");
     pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 2;
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)1, inputs.size());
-    CHECK_EQ((size_t)2, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == outputs[0].shape());
-    CHECK(inputs[0].shape() == outputs[1].shape());
-
+    check(inputs, outputs);
+    // ArgType check still on here,
+    // not sure whether it is better to put inside the check.
     CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
     CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t samples = inputs[0].shape()[0];
-    size_t channels = inputs[0].shape()[1];
-    size_t height = inputs[0].shape()[2];
-    size_t width = inputs[0].shape()[3];
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
 
     CrossMapNormal<Device>(outputs[0].data<real>(),
                            outputs[1].data<real>(),
                            inputs[0].data<real>(),
-                           samples,
-                           channels,
-                           height,
-                           width,
+                           batchSize,
+                           maps,
+                           rows,
+                           columns,
                            size_,
                            scale_,
                            pow_);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
+  }
+
+  // Only need the shape of the input, can calculate the
+  // floating-point operation.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)numInputs_, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations
+    // an approximate value
+    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
+
+    return ops;
+  }
+
 private:
   size_t size_;
   real scale_;
@@ -161,55 +227,107 @@ class CrossMapNormalFunc : public FunctionBase {
 };
 
 /**
- * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ * \brief Backward calculation for normalization with across maps.
  *
- * \param inputs[0] input value.
- * \param inputs[1] output value.
- * \param inputs[2] output grad.
- * \param inputs[3] denoms.
- * \param outputs[0] input grad.
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    -- lower
+ *
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
+ * The upper and lower is the same as forward. The logic of the sum
+ * is also the same as forward.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
+ * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
+ * \param inputs[2]  represent OutputGrad
+ * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
+ *                   This is the intermediate result that is
+ *                   preserved in the forward calculation.
+ * \param outputs[0] represent InputGrad
  */
 template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
+    // function arguments
     size_ = config.get<size_t>("size");
     scale_ = config.get<real>("scale");
     pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 4;
+    numOutputs_ = 1;
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)4, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == inputs[1].shape());
-    CHECK(inputs[0].shape() == inputs[2].shape());
-    CHECK(inputs[0].shape() == inputs[3].shape());
-    CHECK(inputs[0].shape() == outputs[0].shape());
-
-    // TODO(hedaoyuan): need support ASSIGN_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
 
-    size_t samples = inputs[0].shape()[0];
-    size_t channels = inputs[0].shape()[1];
-    size_t height = inputs[0].shape()[2];
-    size_t width = inputs[0].shape()[3];
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
 
     CrossMapNormalGrad<Device>(outputs[0].data<real>(),
                                inputs[0].data<real>(),
                                inputs[1].data<real>(),
                                inputs[2].data<real>(),
                                inputs[3].data<real>(),
-                               samples,
-                               channels,
-                               height,
-                               width,
+                               batchSize,
+                               maps,
+                               rows,
+                               columns,
                                size_,
                                scale_,
                                pow_);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+  }
+
+  // Only need the shape of one input, can calculate the
+  // floating-point operation.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_LT((size_t)1, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations
+    // an approximate value
+    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
+
+    return ops;
+  }
+
 private:
   size_t size_;
   real scale_;
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index d65d9310affd7c..51f5da81bfc9ae 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -53,18 +57,19 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index dbe3a4e9f608df..f47d55a4ade97d 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -79,15 +79,25 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
 void BufferArgs::addArg(const Matrix& arg,
                         const TensorShape& shape,
                         ArgType argType) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
+}
+
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 249f8f9cfad58b..3bbeb6e525f85b 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -50,19 +50,44 @@ class FuncConfig {
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
  * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with Matix object used to adapt Layer Argument.
+ * Will create a BufferArg object in addArg(),
+ * and free in destructor of BufferArgs.
+ *
+ * addArg() with BufferArg object, just save BufferArg object address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * in the BufferArgs life time.
  */
 class BufferArgs {
 public:
   BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
   size_t size() const { return args_.size(); }
 
   // add argument into BufferArgs
   // Tensor can be Matrix, Vector, IVector.
   // For inputs, do not need argType.
   // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  template <typename Tensor>
-  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
-    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -77,20 +102,37 @@ class BufferArgs {
   void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
   void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
 
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
   // get argument
   const BufferArg& operator[](size_t num) const {
     CHECK_LT(num, args_.size());
     return *args_[num];
   }
 
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
 private:
-  std::vector<BufferArgPtr> args_;
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 
 /**
  * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
  *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
  * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
  * and ADD_TO.
  * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
@@ -111,7 +153,36 @@ class FunctionBase {
 
   virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
 
+  // This member function is used to check whether the BufferType and shape of
+  // the inputs and outputs arguments of the Function are correct.
+  // General calc function which will call this check to do arguments check.
+  // And before the calc called, the caller can also check their own arguments.
+  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // Calculate the number of floating-point operations of this Function.
+  // The inputs and outputs arguments do not need to contain the actual data,
+  // only the shape.
+  // And some Functions have the same input and output shapes,
+  // so you may not need to enter the complete number of arguments.
+  // But entering the full arguments is always correct for this interface.
+  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
+    return 0;
+  }
+
+  int getNumInputs() const { return numInputs_; }
+
+  int getNumOutputs() const { return numOutputs_; }
+
   static ClassRegistrar<FunctionBase> funcRegistrar_;
+
+protected:
+  // numInputs_ and numOutputs_ represents the maximum
+  // input and output supported by Function.
+  // Some functions are optimized for input and output,
+  // so when comparing the number of arguments, for these functions
+  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
+  size_t numInputs_;
+  size_t numOutputs_;
 };
 
 #define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7ce908320a6f6f..fdf7e631e5ab8c 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "Function.h"
 #include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -56,4 +57,110 @@ TEST(Function, BufferArgs) {
   Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
 
+/**
+ * Some tests case are used to check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1);
+  check(inputs[0]);
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
 }  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 32131037f6de4a..0cfafdb27f55a3 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -13,97 +13,336 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Function.h"
-#include "paddle/math/Vector.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // Here the input and output can not contain real data,
+ *  // only contains the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // Will according to the type and shape of arguments(inputs_/outputs_),
+ *  // automatic initialization cpu and gpu function required arguments
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
+ *  // Call the CPU and GPU Function calculation results.
+ *  // Compares CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
   }
 
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (const auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
-          size *= dim;
-        }
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
-        } else {
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-          // will use an api to refactor this code.
-          CpuVector cpuVector(size / sizeof(real),
-                              (real*)cpuArgs.back().getData());
-          GpuVector gpuVector(size / sizeof(real),
-                              (real*)gpuArgs.back().getData());
-          cpuVector.uniform(0.001, 1);
-          gpuVector.copyFrom(cpuVector);
-        }
-      }
-    };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
+  ~FunctionCompare() {}
+
+  // input need only contains shape, do not contains data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // assume one copy of sequence is shared by different SequenceArgs
+  void addSequence(const SequenceIdArg& input) {
+    CHECK_EQ(input.shape().ndims(), 1UL);
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
+    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
+                                              TensorShape{numSeqs + 1});
+    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
+                                              TensorShape{numSeqs + 1});
+    /// init sequence Id
+    initArg(*cpuSeq_, batchSize);
+
+    // todo(tianbing), delete it
+    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
+
+    CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
+    GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
+    gpuSeq.copyFrom(cpuSeq);
+  }
+
+  void addInputs(const SequenceArg& input) {
+    CHECK_EQ(input.shape().ndims(), 2UL);
+    size_t batchSize = input.shape()[0];
+    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    /// SequenceArg
+    cpuInputs_.emplace_back(
+        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *cpuSeq_));
+    gpuInputs_.emplace_back(
+        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *gpuSeq_));
+  }
+
+  // output need only contains shape, do not contains data.
+  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+  }
+
+  /// add and init output sparse matrix
+  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+  }
+
+  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
+    CHECK_EQ(output.shape().ndims(), 2UL);
+    size_t batchSize = output.shape()[0];
+
+    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    /// SequenceArg
+    cpuOutputs_.emplace_back(
+        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *cpuSeq_,
+                                      argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *gpuSeq_,
+                                      argType));
+  }
+
+  void addInputs(const SparseMatrixArg& input) {
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
 
+    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
+    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
+
+    initOutputs();
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
-
-    // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
+      }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
       }
+      function->calc(inArgs, outArgs);
     };
-    checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
+
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+
+    // check outputs
+    compareOutputs();
+  }
+
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+
+protected:
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceArg& arg) {
+    /// init only matrix
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = 1 + uniformRandom(std::min<int64_t>(
+                        maxLen, batchSize - pos - numSeqs + i));
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
+
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      if (cpuInputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (cpuInputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
+      } else {
+        initArg(*cpuInputs_[i]);
+      }
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
+                          (real*)cpuInputs_[i]->data());
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
+
+      gpuVector.copyFrom(cpuVector);
+    }
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  void initOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      if (cpuOutputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (cpuOutputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
+      } else {
+        initArg(*cpuOutputs_[i]);
+      }
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
+                          (real*)cpuOutputs_[i]->data());
+      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
+                          (real*)gpuOutputs_[i]->data());
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+      gpuVector.copyFrom(cpuVector);
+    }
+  }
+
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      const auto cpu = cpuOutputs_[i];
+      const auto gpu = gpuOutputs_[i];
+      CHECK_EQ(cpu->numElements(), gpu->numElements());
+      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
+      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
+  }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
+  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
+  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
+  std::shared_ptr<SequenceIdArg> cpuSeq_;
+  std::shared_ptr<SequenceIdArg> gpuSeq_;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
new file mode 100644
index 00000000000000..91b4b8ed91b605
--- /dev/null
+++ b/paddle/function/MulOp.cpp
@@ -0,0 +1,354 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MulOp.h"
+/// todo(tianbing), delete it
+#include <iostream>
+#include "paddle/math/MathFunctions.h"
+#include "paddle/math/SIMDFunctions.h"
+#include "paddle/utils/ThreadLocal.h"
+
+#ifndef PADDLE_TYPE_DOUBLE
+#define GEMM paddle::gemm<float>
+#else
+#define GEMM paddle::gemm<double>
+#endif
+
+namespace {
+inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
+  }
+}
+}  // namespace
+
+namespace paddle {
+/// sparse matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* A = a.getData();
+  const real* B = b.getData();
+  real* C = out.getValue();
+  int* rows = out.getRows();
+  int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
+
+  /// SPARSE_CSC, {a any, b not trans}
+  if (out.getFormat() == SPARSE_CSC) {
+    /// b not trans and a any
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
+    for (size_t i = 0; i < width; i++) {
+      size_t start = out.getColStartIdx(i);
+      size_t end = out.getColStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t rowIdx = rows[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
+  if (out.getFormat() == SPARSE_CSR) {
+    /// a and b can not both transpose
+    CHECK(!(aTrans && bTrans));
+    size_t m = a.getWidth();
+    for (size_t i = 0; i < height; i++) {
+      size_t start = out.getRowStartIdx(i);
+      size_t end = out.getRowStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t colIdx = cols[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+}
+
+/// dense matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  GEMM(aTrans ? CblasTrans : CblasNoTrans,
+       bTrans ? CblasTrans : CblasNoTrans,
+       out.getHeight(),
+       out.getWidth(),
+       !aTrans ? a.getWidth() : a.getHeight(),
+       scaleAB,
+       a.getData(),
+       a.getStride(),
+       b.getData(),
+       b.getStride(),
+       scaleT,
+       out.getData(),
+       out.getStride());
+}
+
+/// dense matrix (+)= sparse matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuSparseMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* B = b.getData();
+  real* C = out.getData();
+  if (out.getWidth() % 32 == 0) {
+    CHECK_EQ((size_t)B % 32, 0UL);
+    CHECK_EQ((size_t)C % 32, 0UL);
+  }
+
+  int* cols = a.getCols();
+  real* values = a.getValue();
+  for (size_t i = 0; i < a.getHeight(); ++i) {
+    const int start = a.getRowStartIdx(i);
+    const int end = a.getRowStartIdx(i + 1);
+    for (int j = start; j < end; ++j) {
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
+               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
+               out.getWidth());
+    }
+  }
+}
+
+/// dense matrix (+)= dense matrix * sparse matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  real* A = const_cast<real*>(a.getData());
+  real* B = const_cast<real*>(b.getValue());
+  real* C = out.getData();
+  int* rows = b.getRows();
+  int* cols = b.getCols();
+
+  /// SPARSE_CSC format
+  if (b.getFormat() == SPARSE_CSC) {
+    for (size_t j = 0; j < b.getWidth(); ++j) {
+      int start = b.getColStartIdx(j);
+      int end = b.getColStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR format
+  if (b.getFormat() == SPARSE_CSR) {
+    for (size_t j = 0; j < b.getHeight(); ++j) {
+      int start = b.getRowStartIdx(j);
+      int end = b.getRowStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+}
+
+/**
+ * mul operator
+ * out = scaleT * out + scaleAB * (A * B)
+ * here, scaleT in {0, 1}, scaleAB == 1,
+ * out = A * B, ASSIGN_TO
+ * out += A * B, ADD_TO
+ *
+ *
+ * \param outputs[0]      output matrix (out), M * N,
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, N is num of columns
+ * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, K is num of columns
+ * \param inputs[1]       second input matrix (B), K * N (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        K is num of rows, N is num of columns
+ *
+ * Support eight Mul operators, with both GPU and CPU devices
+ * For each device, four Mul operators are supported:
+ * 1. dense (out) = dense (A) * dense (B)
+ * 2. dense (out) = sparse (A) * dense (B)
+ *    sparse matrix only support SPARSE_CSR format
+ * 3. dense (out) = dense (A) * sparse (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ * 4. sparse (out) = dense (A) * dense (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ *
+ */
+template <DeviceType Device>
+class MulFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!aTrans_ || !bTrans_)
+        << "Not support both a and b are transpose matrices";
+
+    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// C = A * B, or C += A * B, for matrix format
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
+    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
+
+    /// support dense = not both sparse * sparse
+    /// or sparse = dense * dense
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
+
+    auto outMat = outputs[0].matrix<Device>();
+    /// dense matrix = dense matrix * dense matrix
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = dense matrix * sparse matrix
+    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "Not supported a transpose";
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].sparse().SparseMatrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = sparse matrix * dense matrix
+    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "Not supported b transpose";
+      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
+          << "Only supported SPARSE_CSR format for sparse matrix a";
+      MulOp<Device>(outMat,
+                    inputs[0].sparse().SparseMatrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// sparse matrix = dense matrix * dense matrix
+    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        outputs[0].isSparseArg()) {
+      MulOp<Device>(outSparseMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+  }
+
+private:
+  bool aTrans_;
+  bool bTrans_;
+};
+
+REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
new file mode 100644
index 00000000000000..b6016a6ab6e9d6
--- /dev/null
+++ b/paddle/function/MulOp.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// CPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuSparseMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuSparseMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
new file mode 100644
index 00000000000000..dcfcb2325d7dae
--- /dev/null
+++ b/paddle/function/MulOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "MulOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// dense matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_mul(const_cast<real*>(a.getData()),
+                !aTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(b.getData()),
+                !bTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(out.getData()),
+                out.getHeight(),
+                out.getWidth(),
+                !aTrans ? a.getWidth() : a.getHeight(),
+                scaleAB,
+                scaleT,
+                a.getStride(),
+                b.getStride(),
+                out.getStride());
+}
+
+/// dense matrix (+)= sparse matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuSparseMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(b.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
+                          const_cast<real*>(b.getData()),
+                          HPPL_OP_N,
+                          const_cast<real*>(out.getData()),
+                          out.getHeight(),
+                          out.getWidth(),
+                          b.getHeight(),
+                          scaleAB,
+                          scaleT);
+}
+
+/// dense matrix (+)= dense matrix * sparse matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(a.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+
+  if (b.format_ == SPARSE_CSC) {
+    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),
+                            scaleAB,
+                            scaleT);
+  } else {
+    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),
+                            scaleAB,
+                            scaleT);
+  }
+}
+
+/// sparse matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       const_cast<real*>(b.getData()),
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       out.sMatrix_.get(),
+                       out.getHeight(),
+                       out.getWidth(),
+                       !bTrans ? b.getHeight() : b.getWidth(),
+                       scaleAB,
+                       scaleT);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
new file mode 100644
index 00000000000000..8748eb0d79fa0f
--- /dev/null
+++ b/paddle/function/MulOpTest.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/tests/test_matrixUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+/**
+ *  C += A * B, A, B, C dense matrix
+ *  dense = dense * dense
+ */
+void testFuncDDDMatrix(
+    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
+  real scaleT = 1.0;
+  size_t heightA = (transa == false) ? dimM : dimK;
+  size_t widthA = (transa == false) ? dimK : dimM;
+  size_t heightB = (transb == false) ? dimK : dimN;
+  size_t widthB = (transb == false) ? dimN : dimK;
+  size_t heightC = dimM;
+  size_t widthC = dimN;
+  // init Test object
+  FunctionCompare test(
+      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
+  // prepare input arguments
+  /// matrix A : HA * WA
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
+  /// matrix B: HB * WB
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
+
+  /// output matrix C: HC * WC
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDDMatrixMul) {
+  LOG(INFO) << "function test for dense = dense * dense matrix";
+  for (const auto transa : {false, true}) {
+    for (const auto transb : {false, true}) {
+      for (const auto dimM : {1, 10, 100}) {
+        for (const auto dimN : {1, 10}) {
+          for (const auto dimK : {8}) {
+            if (transa && transb) {
+              continue;
+            }
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK;
+            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, B, C dense, A sparse
+  * dense = sparse * dense
+  */
+void testFuncDSparseDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// sparse matrix A : M * K
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MuLOp, DSparseDMul) {
+  LOG(INFO) << "function test for dense = sparse * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, A, C dense, B sparse
+  * dense = dense * sparse
+  */
+void testFuncDDSparseMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDSparseMul) {
+  LOG(INFO) << "function test for dense = dense * sparse matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, A sparse, B, C dense
+  * sparse = dense * dense
+  */
+void testFuncSparseDDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output sparse matrix C: M * N
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, SparseDDMul) {
+  LOG(INFO) << "function test for sparse = dense * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
new file mode 100644
index 00000000000000..f1a0d2a1a96f24
--- /dev/null
+++ b/paddle/function/PadOp.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Pad<DEVICE_TYPE_CPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
+        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
+        CpuVector inG = CpuVector(inW, inGrad + inoff);
+        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
+        inG += outG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Padding zeros to input according to the specify dimension.
+ *        The struct pad_ contains the padding size in each dimension.
+ *        The input and output is a 4D tensor. In PadFunc, we only
+ *        pad zeros to the 2nd to 4th dimension.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the padding size in each dimension.
+ *                It has six integers. The channelStart and channelEnd indicate
+ *                how many zeros to add before and after the input in channel
+ *                dimension. And the heightStart and heightEnd indicate padding
+ *                in height dimension. The widthStart and widthEnd indicate the
+ *                padding in width dimension.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after padding.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the shape is (1,2,2,3)
+ *
+ * pad_: if channelStart = channelEnd = 1, others are 0.
+ * Output(2,4,2,3) = [
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]],
+ *                      [[0,0,0], [0,0,0]] ],
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]],
+ *                      [[0,0,0], [0,0,0]] ]
+ *                   ] # the shape is (2,4,2,3)
+ *
+ * pad_: if widthStart = 1, widthEnd = 2, others are 0.
+ * Output(2,2,2,6) = [
+ *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
+ *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
+ *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
+ *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
+ *                   ] # the shape is (2,2,2,6)
+ *
+ * pad_: if heightStart = 1, heightEnd = 1, others are 0.
+ * Output(2,2,4,3) = [
+ *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
+ *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
+ *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
+ *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
+ *                   ] # the shape is (2,2,4,3)
+ */
+
+template <DeviceType Device>
+class PadFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
+                                              outputs[0].data<real>());
+    vec.zero();
+
+    Pad<Device>(outputs[0].data<real>(),
+                inputs[0].data<real>(),
+                num,
+                inC,
+                inH,
+                inW,
+                pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+/**
+ * \brief The backward propagation of padding Function. Remove the elements
+ *        in the padding positions of forward.
+ *
+ * Argument in this Function:
+ * \param pad_    The same meaning as it in PadFunc.
+ * \param inputs  The gradient with respect to the output value of PadFunc.
+ * \param outputs The gradient with respect to the input value of PadFunc.
+ */
+
+template <DeviceType Device>
+class PadGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = outputs[0].shape()[0];
+    size_t inC = outputs[0].shape()[1];
+    size_t inH = outputs[0].shape()[2];
+    size_t inW = outputs[0].shape()[3];
+
+    if (outputs[0].getArgType() != ADD_TO) {
+      // for unit test
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
+
+    PadGrad<Device>(outputs[0].data<real>(),
+                    inputs[0].data<real>(),
+                    num,
+                    inC,
+                    inH,
+                    inW,
+                    pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
new file mode 100644
index 00000000000000..7b5c730a6a0fa5
--- /dev/null
+++ b/paddle/function/PadOp.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+struct PadConf {
+  /// how many values to add before the data along channel dimension.
+  int channelStart;
+  /// how many values to add after the data along channel dimension.
+  int channelEnd;
+  /// how many values to add before the data along height dimension.
+  int heightStart;
+  /// how many values to add after the data along height dimension.
+  int heightEnd;
+  /// how many values to add before the data along width dimension.
+  int widthStart;
+  /// how many values to add after the data along width dimension.
+  int widthEnd;
+};
+
+/**
+ * \brief  This funtion pads zeros to inputs according to the specify dimension.
+ *         The input and output is a 4D tensor. Padding zeros from the 2nd to
+ *         the 4th dimenstion according argument of pad.
+ *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inH     with of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specify dimension.
+ */
+template <DeviceType Device>
+void Pad(real* outputs,
+         const real* inputs,
+         const int num,
+         const int inC,
+         const int inH,
+         const int inW,
+         const PadConf& pad);
+
+/**
+ * \brief   Padding operation backward.
+ *
+ * \param[out] inGrad  gradients of previous layer.
+ * \param[in]  outGrad output gradients.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inH     with of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specify dimension.
+ */
+template <DeviceType Device>
+void PadGrad(real* inGrad,
+             const real* outGrad,
+             const int num,
+             const int inC,
+             const int inH,
+             const int inW,
+             const PadConf& pad);
+}  // namespace paddle
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
new file mode 100644
index 00000000000000..9104b1aca507c5
--- /dev/null
+++ b/paddle/function/PadOpGpu.cu
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "PadOp.h"
+
+namespace paddle {
+
+__global__ void KePad(real* outputs, const real* inputs,
+                      int inC, int inH, int inW,
+                      int padc, int padh, int padw,
+                      int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % inW;
+    const int h = (idx / inW) % inH;
+    const int c = (idx / inW / inH) % inC;
+    const int n = idx / inW / inH / inC;
+
+    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+    outputs[off] = inputs[idx];
+  }
+}
+
+template <>
+void Pad<DEVICE_TYPE_GPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  size_t nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + 1024 - 1) / 1024;
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+    (outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
+     outC, outH, outW, nth);
+  CHECK_SYNC("Pad");
+}
+
+__global__ void KePadDiff(real* inGrad, const real* outGrad,
+                          int inC, int inH, int inW,
+                          int padc, int padh, int padw,
+                          int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % inW;
+    const int h = (idx / inW) % inH;
+    const int c = (idx / inW / inH) % inC;
+    const int n = idx / inW / inH / inC;
+
+    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+    inGrad[idx] += outGrad[off];
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  int nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + 1024 - 1) / 1024;
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+    (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
+     outC, outH, outW, nth);
+  CHECK_SYNC("PadGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
new file mode 100644
index 00000000000000..cd22d911356791
--- /dev/null
+++ b/paddle/function/PadOpTest.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Pad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+
+          FunctionCompare compare("Pad",
+                                  FuncConfig()
+                                      .set("cstart", 2)
+                                      .set("cend", 3)
+                                      .set("hstart", 1)
+                                      .set("hend", 2)
+                                      .set("wstart", 3)
+                                      .set("wend", 2));
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
+          compare.run();
+        }
+      }
+    }
+  }
+}
+
+TEST(PadGrad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          FunctionCompare compare("PadGrad",
+                                  FuncConfig()
+                                      .set("cstart", 2)
+                                      .set("cend", 3)
+                                      .set("hstart", 1)
+                                      .set("hend", 2)
+                                      .set("wstart", 3)
+                                      .set("wend", 2));
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
+          compare.run();
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index e491e3f1d6b26e..cda58f19dfa4a8 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -55,6 +55,15 @@ class TensorShape {
     numElements();
   }
 
+  void reshape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    if (ndims_ > kMinDims) {
+      dims_.resize(ndims_);
+    }
+    dims_.assign(dims);
+    numElements();
+  }
+
   // number of dimensions of the tensor
   size_t ndims() const { return ndims_; }
 
@@ -82,7 +91,7 @@ class TensorShape {
 
   // init dims_
   void initDims(size_t ndims) {
-    size_t count = ndims < 4 ? 4 : ndims;
+    size_t count = ndims < kMinDims ? kMinDims : ndims;
     dims_.assign(count, 1);
   }
 
@@ -92,6 +101,7 @@ class TensorShape {
   // number of elements
   size_t nelements_;
   std::vector<size_t> dims_;
+  static const size_t kMinDims = 4;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
index 98942cff9e2ea4..8308bbd8ad4fe1 100644
--- a/paddle/function/TensorType.h
+++ b/paddle/function/TensorType.h
@@ -31,6 +31,10 @@ enum DeviceType {
   DEVICE_TYPE_GPU = 2
 };
 
+enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
+
+enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
+
 inline int sizeOfValuType(ValueType valueType) {
   if (valueType == VALUE_TYPE_INT32) {
     return 4;
@@ -87,6 +91,29 @@ struct MatrixT<int, DEVICE_TYPE_GPU> {
   using type = void;  // Not implemented
 };
 
+template <typename VType, DeviceType Device>
+struct SparseMatrixT;
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
 template <typename VType, DeviceType Device>
 struct VectorT;
 
@@ -114,8 +141,9 @@ struct VectorT<int, DEVICE_TYPE_GPU> {
 
 template <typename VType, DeviceType DType>
 struct Tensor {
-  typedef typename detail::MatrixT<VType, DType>::type Matrix;
   typedef typename detail::VectorT<VType, DType>::type Vector;
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;
+  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f8c4bcac2f8eb4..c541b72e104bf2 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,8 +69,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  void forward(Argument& act) { (void)act; }
-  void backward(Argument& act) { (void)act; }
+  Error __must_check forward(Argument& act) {
+    (void)act;
+    return Error();
+  }
+  Error __must_check backward(Argument& act) {
+    (void)act;
+    return Error();
+  }
   const std::string& getName() const { return name; }
 };
 const std::string IdentityActivation::name = "";
@@ -86,8 +92,14 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-void forward(Argument& act) { act.value->sigmoid(*act.value); }
-void backward(Argument& act) { act.grad->sigmoidDerivative(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->sigmoid(*act.value);
+  return Error();
+}
+Error __must_check backward(Argument& act) {
+  act.grad->sigmoidDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(sigmoid)
 
 /**
@@ -103,9 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-void forward(Argument& act) { act.value->softmax(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softmax(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -137,6 +152,7 @@ void backward(Argument& act) {
 
     act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(softmax)
 
@@ -151,8 +167,11 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-void forward(Argument& act) {
-  CHECK_EQ(act.value->getWidth(), 1UL);
+Error __must_check forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   if (!argument_.value) {
     argument_.value = Matrix::create(nullptr,
@@ -169,10 +188,14 @@ void forward(Argument& act) {
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
+  return Error();
 }
 
-void backward(Argument& act) {
-  CHECK_EQ(act.grad->getWidth(), 1UL);
+Error __must_check backward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   size_t numSequences = act.getNumSequences();
   const int* starts = act.sequenceStartPositions->getData(false);
@@ -184,8 +207,10 @@ void backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    softmax_.backward(argument_);
+    Error status = softmax_.backward(argument_);
+    if (!status) return status;
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
@@ -200,9 +225,15 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-void forward(Argument& act) { act.value->relu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->reluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(relu)
 
 /**
@@ -219,9 +250,15 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-void forward(Argument& act) { act.value->brelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->breluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(brelu)
 
 /**
@@ -231,9 +268,15 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-void forward(Argument& act) { act.value->tanh(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->tanh(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->tanhDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->tanhDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(tanh)
 
 /**
@@ -248,10 +291,14 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-void forward(Argument& act) { act.value->scaledTanh(*act.value, a, b); }
+Error __must_check forward(Argument& act) {
+  act.value->scaledTanh(*act.value, a, b);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
+  return Error();
 }
 END_DEFINE_ACTIVATION(stanh)
 
@@ -262,9 +309,15 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-void forward(Argument& act) { act.value->softrelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softrelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->softreluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->softreluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(softrelu)
 
 /**
@@ -280,7 +333,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -290,9 +343,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->abs2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->absDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->absDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(abs)
 
 /**
@@ -302,7 +359,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -312,9 +369,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->square2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->squareDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(square)
 
 /**
@@ -324,9 +385,15 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp2(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->exp2(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->expDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->expDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(exponential)
 
 /**
@@ -336,7 +403,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -346,9 +413,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->log2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 601e3b6c0cd401..f208224e304a79 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -48,7 +49,7 @@ class ActivationFunction {
    *
    * Usually, act is Layer::output_
    */
-  virtual void forward(Argument& act) = 0;
+  virtual Error __must_check forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -57,7 +58,7 @@ class ActivationFunction {
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual void backward(Argument& act) = 0;
+  virtual Error __must_check backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index c26e242534f2af..b8079dc0796d0e 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -647,7 +647,7 @@ class PyDataProvider2 : public DataProvider {
       DataBatch& gpuBatch = *batch;
       std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
       gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(size);
+      gpuBatch.setSize(bsize);
       for (size_t i = 0; i < headers_.size(); ++i) {
         gpuArguments[i].resizeAndCopyFrom(
             cpuArguments[i], useGpu_, HPPL_STREAM_1);
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 05aa6c012ae2bc..132119015f967c 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * calculate sequence-to-sequence edit distance
  */
-class CTCErrorEvaluator : public Evaluator {
+class CTCErrorEvaluator : public NotGetableEvaluator {
 private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ae7508e2bb117a..9db6d252d97bfe 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/utils/Stat.h"
-
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
 
 DECLARE_int32(trainer_id);
 
@@ -39,6 +39,14 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  */
 class ClassificationErrorEvaluator : public Evaluator {
 public:
+  /*
+  ClassificationErrorEvaluator() : totalScore2_(0) {}
+
+  virtual void start() {
+    Evaluator::start();
+    totalScore2_ = 0;
+    } */
+
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     if (3 == arguments.size()) {
       numSamples_ += arguments[2].value->getSum();
@@ -76,9 +84,11 @@ class ClassificationErrorEvaluator : public Evaluator {
                                               1,
                                               /* trans= */ false,
                                               useGpu(arguments[0].deviceId));
+
     errorMat->zeroMem();
+
     if (label != nullptr) {
-      errorMat->classificationError(*output, *label);
+      errorMat->classificationError(*output, *label, config_.top_k());
     } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
                dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
       errorMat->classificationErrorMulti(
@@ -94,6 +104,16 @@ class ClassificationErrorEvaluator : public Evaluator {
     return errorMat;
   }
 
+  void printStats(std::ostream& os) const {
+    if (config_.top_k() == 1) {
+      os << config_.name() << "="
+         << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    } else {
+      os << " top_" << config_.top_k()
+         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    }
+  }
+
   virtual real evalImp(std::vector<Argument>& arguments) {
     MatrixPtr errorMat = calcError(arguments);
     return errorMat->getSum();
@@ -102,6 +122,10 @@ class ClassificationErrorEvaluator : public Evaluator {
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "classification_error"; }
 };
 
 /**
@@ -140,6 +164,10 @@ class SequenceClassificationErrorEvaluator
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
                    SequenceClassificationErrorEvaluator);
@@ -230,6 +258,10 @@ class SumEvaluator : public Evaluator {
 private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "sum"; }
 };
 /**
  * @brief column sum Evaluator
@@ -337,10 +369,18 @@ class ColumnSumEvaluator : public Evaluator {
   }
 
 private:
-  ColumnSumEvaluator() {}
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const {
+    if (colIdx_ == -1)
+      return "last-column-sum";
+    else
+      return "column-sum";
+  }
 };
 
 void AucEvaluator::start() {
@@ -449,6 +489,16 @@ double AucEvaluator::calcAuc() const {
   }
 }
 
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
+
 // class RankAucEvaluator
 REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
 
@@ -528,12 +578,15 @@ double RankAucEvaluator::calcRankAuc(real* outputData,
                                         : aucTmp / (clickSum * noClickSum);
 }
 
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
+
 // class PrecisionRecallEvaluator
 REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
 
 void PrecisionRecallEvaluator::start() {
   Evaluator::start();
   statsInfo_.clear();
+  values_.clear();
 }
 
 real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
@@ -594,52 +647,23 @@ real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
 }
 
 void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    double precision =
-        calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    double recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    os << "positive_label=" << label << " precision=" << precision
-       << " recall=" << recall
-       << " F1-score=" << calcF1Score(precision, recall);
-    return;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  double macroAvgPrecision = 0;
-  double macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    macroAvgPrecision += calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  macroAvgPrecision /= numLabels;
-  macroAvgRecall /= numLabels;
-  double macroAvgF1Score = calcF1Score(macroAvgPrecision, macroAvgRecall);
-  os << "macro-average-precision=" << macroAvgPrecision
-     << " macro-average-recall=" << macroAvgRecall
-     << " macro-average-F1-score=" << macroAvgF1Score;
-
-  double microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  double microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  double microAvgF1Score = calcF1Score(microAvgPrecision, microAvgRecall);
-  if (!isMultiBinaryLabel_) {
-    // precision and recall are equal in this case
-    os << " micro-average-precision=" << microAvgPrecision;
-  } else {
-    os << " micro-average-precision=" << microAvgPrecision
-       << " micro-average-recall=" << microAvgRecall
-       << " micro-average-F1-score=" << microAvgF1Score;
+  PrintStatsInfo info;
+  bool containMacroMicroInfo = getStatsInfo(&info);
+  os << "positive_label=" << config_.positive_label()
+     << " precision=" << info.precision << " recall=" << info.recall
+     << " F1-score=" << info.f1;
+  if (containMacroMicroInfo) {
+    os << "macro-average-precision=" << info.macroAvgPrecision
+       << " macro-average-recall=" << info.macroAvgRecall
+       << " macro-average-F1-score=" << info.macroAvgF1Score;
+    if (!isMultiBinaryLabel_) {
+      // precision and recall are equal in this case
+      os << " micro-average-precision=" << info.microAvgPrecision;
+    } else {
+      os << " micro-average-precision=" << info.microAvgPrecision
+         << " micro-average-recall=" << info.microAvgRecall
+         << " micro-average-F1-score=" << info.microAvgF1Score;
+    }
   }
 }
 
@@ -721,6 +745,60 @@ void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
   }
 }
 
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  if (this->values_.size() == 0) {
+    PrintStatsInfo info;
+    bool containMacroMicroInfo = getStatsInfo(&info);
+    values_["precision"] = info.precision;
+    values_["recal"] = info.recall;
+    values_["F1-score"] = info.f1;
+    if (containMacroMicroInfo) {
+      values_["macro-average-precision"] = info.macroAvgPrecision;
+      values_["macro-average-recall"] = info.macroAvgRecall;
+      values_["macro-average-F1-score"] = info.macroAvgF1Score;
+      if (!isMultiBinaryLabel_) {
+        // precision and recall are equal in this case
+        values_["micro-average-precision"] = info.microAvgPrecision;
+      } else {
+        values_["micro-average-precision"] = info.microAvgPrecision;
+        values_["micro-average-recall"] = info.microAvgRecall;
+        values_["micro-average-F1-score"] = info.microAvgF1Score;
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  names->reserve(this->values_.size());
+  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
+    names->push_back(this->config_.name() + "." + it->first);
+  }
+}
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;
+  paddle::str::split(name, '.', &buffers);
+  auto it = this->values_.find(buffers[buffers.size() - 1]);
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
 void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   size_t size = 4 * statsInfo_.size();
   double* buf = new double[size];
@@ -740,6 +818,47 @@ void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   delete[] buf;
 }
 
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  info->macroAvgPrecision = 0;
+  info->macroAvgRecall = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
+
 REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
 void PnpairEvaluator::start() {
   Evaluator::start();
@@ -864,56 +983,35 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
             << " calc total special pair: " << special;
 }
 
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
+
 ClassRegistrar<Evaluator> Evaluator::registrar_;
 Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = nullptr;
-  if (config.type() == "classification_error") {
-    evaluator = new ClassificationErrorEvaluator();
-  } else if (config.type() == "sum") {
-    evaluator = new SumEvaluator();
-  } else if (config.type() == "last-column-sum") {
-    evaluator = new ColumnSumEvaluator(-1);
-  } else if (config.type() == "last-column-auc") {
-    evaluator = new AucEvaluator(-1);
-  } else {
-    evaluator = registrar_.createByType(config.type());
-  }
+  Evaluator* evaluator = registrar_.createByType(config.type());
   evaluator->init(config);
   return evaluator;
 }
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
 /**
  * @brief print value of each layer.
  *
  * The config file api is value_printer_evaluator.
  */
-class ValuePrinter : public Evaluator {
+class ValuePrinter : public NotGetableEvaluator {
 public:
-  ValuePrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        std::ostringstream os;
-        argu.value->print(os);
-        LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-      }
-      if (argu.ids) {
-        std::ostringstream os;
-        argu.ids->print(os, argu.ids->getSize());
-        LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-      }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
-      if (auto subStartPos = argu.subSequenceStartPositions) {
-        std::ostringstream os;
-        subStartPos->getVector(false)->print(os, subStartPos->getSize());
-        LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                  << os.str();
-      }
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
     }
   }
 
@@ -922,15 +1020,14 @@ class ValuePrinter : public Evaluator {
   virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
 };
 REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
 /**
  * @brief print gradient of each layer.
  *
  * The config file api is gradient_printer_evaluator.
  */
-class GradientPrinter : public Evaluator {
+class GradientPrinter : public NotGetableEvaluator {
 public:
-  GradientPrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -939,11 +1036,6 @@ class GradientPrinter : public Evaluator {
         argu.grad->print(os);
         LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
       }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
     }
   }
 
@@ -957,7 +1049,7 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  *
  * The config file api is maxid_printer_evaluator.
  */
-class MaxIdPrinter : public Evaluator {
+class MaxIdPrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -999,7 +1091,7 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  *
  * The config file api is maxframe_printer_evaluator.
  */
-class MaxFramePrinter : public Evaluator {
+class MaxFramePrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -1086,7 +1178,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  * The config file api is seqtext_printer_evaluator.
  *
  */
-class SequenceTextPrinter : public Evaluator {
+class SequenceTextPrinter : public NotGetableEvaluator {
 private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
@@ -1253,4 +1345,6 @@ class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
 };
 REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
 
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
 }  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 5770847309670e..b114500e2b7c1e 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/parameter/Argument.h"
 #include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -117,12 +118,105 @@ class Evaluator {
 
   static ClassRegistrar<Evaluator> registrar_;
 
+  /**
+   * @brief getNames will return all field names of current evaluator.
+   *
+   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
+   * has multiple field, the name could be `evaluator_name.field1`. For example
+   * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get
+   * names will return `precision_recall_evaluator.precision`,
+   * `precision_recall_evaluator.recal`, etc.
+   *
+   * Also, if current Evaluator is a combined evaluator. getNames will return
+   * all names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: the field names of current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluate value of one field.
+   *
+   * @param name: The field name of current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluate value(metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
+   * 'precision_recall'. In combined evaluator, different name may get different
+   * evaluate type because it could be evaluated by different evaluator inside.
+   *
+   * @param name: The field name of current Evaluator.
+   * @param err: The error state. nullptr means don't care.
+   * @return the evaluator type string.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return std::string();
+    }
+    return this->getTypeImpl();
+  }
+
+protected:
+  /**
+   * @brief getValueImpl The simplest way to define getValue result. If this
+   * evaluator doesn't contain multiple fields, and do not throw any error, just
+   * implemented this method to get the evaluate result(metric).
+   * @return Evaluate result(metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl The simplest way to define getType result. If this
+   * evaluator doesn't combine many evaluators, the get type should only return
+   * itself type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
+
 protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
 };
 
+/**
+ * @brief The NotGetableEvaluator class is the base class of evaluator that
+ * cannot get value in runtime. The most NotGetableEvaluator is Printer
+ * Evaluator, which is only used to debug network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names) {}
+
+  real getValue(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return .0f;
+  }
+  std::string getType(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return "";
+  }
+};
+
 class DummyEvaluator : public Evaluator {
 public:
   DummyEvaluator() {}
@@ -135,6 +229,10 @@ class DummyEvaluator : public Evaluator {
   }
   virtual void finish() {}
   virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief evaluate AUC using colIdx-th column as prediction.
@@ -191,6 +289,11 @@ class AucEvaluator : public Evaluator {
   }
 
   double calcAuc() const;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
 };
 
 /**
@@ -223,6 +326,10 @@ class RankAucEvaluator : public Evaluator {
                      real* clickData,
                      real* pvData,
                      size_t size);
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief precision, recall and f1 score Evaluator
@@ -272,6 +379,20 @@ class PrecisionRecallEvaluator : public Evaluator {
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
   void calcStatsInfo(const MatrixPtr& output,
                      const IVectorPtr& label,
                      const MatrixPtr& weight);
@@ -303,6 +424,15 @@ class PrecisionRecallEvaluator : public Evaluator {
       return 0;
     }
   }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names);
+  real getValue(const std::string& name, Error* err) const;
+  std::string getType(const std::string& name, Error* err) const;
 };
 
 /*
@@ -349,8 +479,7 @@ class PnpairEvaluator : public Evaluator {
   virtual void finish() { calc(predictArray_); }
 
   virtual void printStats(std::ostream& os) const {
-    os << " pos/neg"
-       << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+    os << " pos/neg=" << this->getValueImpl();
   }
 
   virtual void distributeEval(ParameterClient2* client) {
@@ -366,6 +495,13 @@ class PnpairEvaluator : public Evaluator {
   IVectorPtr cpuLabel_;
   IVectorPtr cpuInfo_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+  }
+  std::string getTypeImpl() const;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 36ca05b919b136..3eb87d9b85c820 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -60,55 +60,6 @@ GradientMachine* GradientMachine::create(
   return nullptr;
 }
 
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         DataConfig* dataConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, dataConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         DataConfig* dataConfig) {
-  TrainerConfig trainerConfig;
-  GradientMachine* ret = create(is, &trainerConfig);
-  if (dataConfig && trainerConfig.has_data_config()) {
-    *dataConfig = trainerConfig.data_config();
-  }
-  return ret;
-}
-
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         TrainerConfig* trainerConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, trainerConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         TrainerConfig* trainerConfig) {
-  TrainerConfig trainerConfigTemp;
-  int64_t size;
-  CHECK(is.read((char*)&size, sizeof(size))) << "Fail to read ";
-  std::string buf;
-  buf.resize(size);
-  CHECK(is.read(&buf[0], size)) << "Fail to read ";
-  CHECK(trainerConfigTemp.ParseFromString(buf)) << "Fail to parse config";
-  std::unique_ptr<GradientMachine> machine(
-      create(trainerConfigTemp.model_config()));
-  std::vector<ParameterPtr>& parameters = machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  machine->onLoadParameter();
-
-  if (trainerConfig) {
-    *trainerConfig = trainerConfigTemp;
-  }
-
-  return machine.release();
-}
-
 void GradientMachine::saveParameters(const std::string& dir) const {
   LOG(INFO) << "Saving parameters to " << dir;
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 1e35c7e2b8d185..bc2f2f8563526a 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -89,39 +89,6 @@ class GradientMachine {
           std::vector<ParameterType>{
               PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
 
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is, DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 TrainerConfig* trainerConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is,
-                                 TrainerConfig* trainerConfig);
-
   virtual ~GradientMachine() {}
 
   /**
@@ -167,6 +134,10 @@ class GradientMachine {
     backward(callback);
   }
 
+  virtual Argument getLayerOutput(const std::string& layerName) {
+    return *((Argument*)nullptr);
+  }
+
   // see comment in Layer.h for the function with the same name
   virtual void resetState() {}
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 80f223824d8dcc..123273f916f5d3 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -282,6 +282,18 @@ void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
   backwardImp(callback);
 }
 
+Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
+  std::vector<Argument> args;
+  args.reserve(threads_.size());
+
+  for (auto& thread : threads_) {
+    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
+  }
+  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
+
+  return outLayerArgs_;
+}
+
 void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
   for (size_t i = 0; i < parameters_.size(); i++) {
     if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 9be15ef4bcf34f..838a52b5153af6 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -189,6 +189,8 @@ class MultiGradientMachine : public GradientMachine {
                        PassType passType,
                        const UpdateCallback& callback);
 
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   virtual void onPassEnd();
 
   virtual void finish();
@@ -314,6 +316,8 @@ class MultiGradientMachine : public GradientMachine {
   std::vector<Argument> outArgs_;
   hl_stream_t outArgStream_;
 
+  Argument outLayerArgs_;
+
   /// ParameterType which needs to be merged from each GPU
   std::vector<ParameterType> mergeTypes_;
   int numDevices_;         /* number of gpu devices */
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 22051e07ee0026..4512aacc81f86b 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -293,11 +293,10 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
-MatrixPtr NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  auto it = layerMap_.find(layerName);
-  CHECK(it != layerMap_.end()) << "Cannot find layer: " << layerName;
-  return it->second->getOutputValue();
+Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
+  return getLayer(layerName)->getOutput();
 }
+
 void NeuralNetwork::onPassEnd() {
   for (auto& layer : layers_) {
     layer->onPassEnd();
@@ -306,7 +305,6 @@ void NeuralNetwork::onPassEnd() {
 
 class CombinedEvaluator : public Evaluator {
 public:
-  CombinedEvaluator() {}
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
@@ -346,6 +344,55 @@ class CombinedEvaluator : public Evaluator {
 
 protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
+
+  // Evaluator interface
+public:
+  /**
+   * @brief getNames will return all inside evaluators' names.
+   * @param names [out]: return names.
+   */
+  void getNames(std::vector<std::string>* names) {
+    for (auto& eval : evaluators_) {
+      eval->getNames(names);
+    }
+  }
+
+  /**
+   * @brief getValue could get all inside evaluators' value.
+   */
+  real getValue(const std::string& name, Error* err) const {
+    return this->getMethodHelper<real>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getValue(name, err);
+        });
+  }
+
+  /**
+   * @brief getType could get all inside evaluators' type.
+   */
+  std::string getType(const std::string& name, Error* err) const {
+    return this->getMethodHelper<std::string>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getType(name, err);
+        });
+  }
+
+private:
+  template <typename T>
+  T getMethodHelper(const std::string& name,
+                    Error* err,
+                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
+                        callback) const {
+    for (auto& eval : evaluators_) {
+      std::vector<std::string> names;
+      eval->getNames(&names);
+      if (std::find(names.begin(), names.end(), name) != names.end()) {
+        return callback(eval);
+      }
+    }
+    *err = Error("No such key %s", name.c_str());
+    return T();
+  }
 };
 
 Evaluator* NeuralNetwork::makeEvaluator() const {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 25af4abcf81700..e7b6c438407e7e 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -87,7 +87,8 @@ class NeuralNetwork : public GradientMachine {
 
   virtual void backward(const UpdateCallback& callback = nullptr);
 
-  MatrixPtr getLayerOutput(const std::string& layerName);
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   const LayerPtr& getLayer(const std::string& layerName) const {
     auto it = layerMap_.find(layerName);
     CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index a9a9f4f903e305..2ab964b8fc2e08 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -155,7 +155,8 @@ class BootBiasLayer : public Layer {
 public:
   explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
 
     if (biasParameter_) {
@@ -174,7 +175,7 @@ class BootBiasLayer : public Layer {
     }
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     if (biases_) {
       MatrixPtr outV = getOutputValue();
       outV->addBias(*(biases_->getW()), 1);
@@ -182,7 +183,7 @@ class BootBiasLayer : public Layer {
     }
   }
 
-  virtual void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     if (biases_) {
       backwardActivation();
       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 53d3f99cdd3439..4e98c174b46276 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -44,19 +44,20 @@ class AddtoLayer : public Layer {
   /**
    * Intialization of AddtoLayer.
    */
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Forward propagation.
    * @note There is no weight matrix for each input,
    *       because it just a simple add operation.
    */
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
   /**
    * Backward propagation.
    */
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index 41683ad6712d5d..b6dac7ae6fec2d 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -35,7 +35,8 @@ class AgentLayer : public Layer {
 
   ~AgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // if *numSamples* set,
   // real layer output will only use first *numSamples* rows
@@ -44,8 +45,8 @@ class AgentLayer : public Layer {
     numSamples_ = numSamples;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -56,8 +57,8 @@ class SequenceAgentLayer : public AgentLayer {
   explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
   ~SequenceAgentLayer() {}
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -78,7 +79,8 @@ class GatherAgentLayer : public Layer {
 
   virtual ~GatherAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // call before addRealLayer
   void copyIdAndSequenceInfo(const Argument& input,
@@ -88,8 +90,8 @@ class GatherAgentLayer : public Layer {
   // add one real layer, can call many times
   void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
@@ -133,7 +135,8 @@ class ScatterAgentLayer : public Layer {
 
   virtual ~ScatterAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief set real layer in generation
@@ -182,8 +185,8 @@ class ScatterAgentLayer : public Layer {
     numSequences_ = numSequences;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index b3c4ecec8bc6f5..621e1d7bb12ec5 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -38,12 +38,11 @@ class AverageLayer : public SequencePoolLayer {
   explicit AverageLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~AverageLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   MatrixPtr outMtx_;
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 75bda95de1472b..230bafc31d96bb 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -52,7 +52,8 @@ class BatchNormBaseLayer : public Layer {
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief Calculate feature map size. Some input uses frameHeight and
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 195acbbfc58db8..f6115801fc6b34 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -33,9 +33,10 @@ class BatchNormalizationLayer : public BatchNormBaseLayer {
 
   ~BatchNormalizationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /// Epsilon value used in the batch normalization formula.
@@ -58,7 +59,7 @@ class BatchNormalizationLayer : public BatchNormBaseLayer {
   /// to batch, channels* imagePixels.
   void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
 
-  void onPassEnd() { firstTest_ = true; }
+  void onPassEnd() override { firstTest_ = true; }
 
   MatrixPtr tmpMat_, tmpGrad_;
   MatrixPtr expandedIn_, expandedOut_;
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index 4ff4b0ea793dc9..27c269f2781c99 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -38,9 +38,10 @@ class BilinearInterpLayer : public Layer {
   virtual ~BilinearInterpLayer() {}
 
   size_t getSize();
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index cc96fdd03fcac6..8f347400e60ec8 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -58,10 +58,11 @@ class BlockExpandLayer : public Layer {
 
   ~BlockExpandLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index 1fd444ad10e71d..3cbcac6cf62dec 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -32,9 +32,10 @@ namespace paddle {
 class CRFDecodingLayer : public CRFLayer {
 public:
   explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   std::unique_ptr<LinearChainCRF> crf_;
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index d21b32b68c1a40..de36a85083b6b2 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -29,9 +29,10 @@ namespace paddle {
 class CRFLayer : public Layer {
 public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   size_t numClasses_;
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index 70d429bad656ad..f7a515f312d075 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -22,10 +22,11 @@ namespace paddle {
 class CTCLayer : public Layer {
 public:
   explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
   void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  virtual void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
   void backwardImp(const UpdateCallback& callback,
                    const Argument& softmaxSeqs,
                    const Argument& labelSeqs);
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index d19adace7d58af..c5fc4cf4f81a55 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -28,10 +28,11 @@ class ConcatenateLayer : public Layer {
 
   ~ConcatenateLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(concat, ConcatenateLayer);
@@ -101,10 +102,11 @@ class ConcatenateLayer2 : public Layer {
 
   ~ConcatenateLayer2() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index ebcc87cbf48a3c..d7042af1c25e74 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -118,16 +118,15 @@ void ContextProjection::forward() {
   /// first use state_, otherwise use weight_(padding false === w nullptr)
   auto w_ptr =
       state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  auto start_pos = in_->sequenceStartPositions;
-
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(*in_->value);
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->value, ADD_TO);
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
   forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
@@ -166,13 +165,16 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(CpuMatrix(
-      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->grad, ADD_TO);
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *in_->sequenceStartPositions->getVector(useGpu_),
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
   backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index aedf4100e32fa1..e9d15d94f806a5 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -80,7 +80,8 @@ class ConvBaseLayer : public Layer {
 public:
   explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index 9bfb1ab7a47b11..002be415691f0b 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -47,10 +47,11 @@ class ConvShiftLayer : public Layer {
 
   ~ConvShiftLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(conv_shift, ConvShiftLayer);
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index ed57f2af3c6455..32eb3bf604acaa 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -49,10 +49,11 @@ class ConvexCombinationLayer : public Layer {
 
   ~ConvexCombinationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
index 254120443dc3d4..57ba124e40cbd0 100644
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@@ -26,15 +26,23 @@ bool CosSimLayer::init(const LayerMap& layerMap,
   Layer::init(layerMap, parameterMap);
 
   CHECK_EQ(inputLayers_.size(), 2LU);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   int batchSize = getInputValue(0)->getHeight();
   int size = getSize();
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   {
     REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
@@ -42,26 +50,43 @@ void CosSimLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
   /* activation */ {
     REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
     MatrixPtr prevOut1 = getInputValue(0);
     MatrixPtr prevOut2 = getInputValue(1);
-    outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale());
+
+    CHECK(outV && prevOut1 && prevOut2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*prevOut1);
+    inputs.addArg(*prevOut2);
+    outputs.addArg(*outV, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimLayer::backward(const UpdateCallback& callback) {
   /* activation */ {
     REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    MatrixPtr outG = this->getOutputGrad();
-
-    outG->cosSimDerivative(*this->getOutputValue(),
-                           *getInputValue(0),
-                           *getInputValue(1),
-                           *getInputGrad(0),
-                           *getInputGrad(1),
-                           config_.cos_scale());
+    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
+
+    const auto outG = this->getOutputGrad();
+    const auto outV = this->getOutputValue();
+    const auto inV1 = this->getInputValue(0);
+    const auto inV2 = this->getInputValue(1);
+    auto inG1 = this->getInputGrad(0);
+    auto inG2 = this->getInputGrad(1);
+    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*outG);
+    inputs.addArg(*outV);
+    inputs.addArg(*inV1);
+    inputs.addArg(*inV2);
+    outputs.addArg(*inG1, ADD_TO);
+    outputs.addArg(*inG2, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 5dcc5d8a5b4dc7..8afaee62c2dcac 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
  *
  * - Input1: A vector (batchSize * dataDim) *
  * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (dataDim * 1)
+ * - Output: A vector (batchSize * 1)
  *
  * The config file api is cos_sim.
  */
@@ -38,10 +38,11 @@ class CosSimLayer : public Layer {
 
   ~CosSimLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index ad490b0b8c4656..0f887d8adfa053 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
-
 /**
  * @brief A layer for computing cosine similarity between a vector
  * and each row of a matrix
@@ -46,10 +45,11 @@ class CosSimVecMatLayer : public Layer {
 
   ~CosSimVecMatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
@@ -97,11 +97,22 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap,
                            dataDim,
                            /* trans= */ false,
                            useGpu_);
+
+  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimVecMatLayer::forward(PassType passType) {
   Layer::forward(passType);
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
@@ -117,17 +128,25 @@ void CosSimVecMatLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
+  CHECK(outV && inV0 && inV1);
   REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
   for (size_t i = 0; i < batchSize; i++) {
     tmpRow0->setData(inV0->rowBuf(i));
     tmpMtx0->setData(inV1->rowBuf(i));
     tmpRow2->setData(outV->rowBuf(i));
-    tmpRow2->cosSim(*(tmpMtx0), *(tmpRow0), config_.cos_scale());
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpRow2, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
+  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
+
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
   MatrixPtr inG0 = getInputGrad(0);
@@ -136,27 +155,27 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outG = getOutputGrad();
 
   size_t batchSize = inV0->getHeight();
-
+  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
   REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
 
-  if (inG0 && inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->rowBuf(i));
-      tmpRow1->setData(inG0->rowBuf(i));
-      tmpMtx0->setData(inV1->rowBuf(i));
-      tmpMtx1->setData(inG1->rowBuf(i));
-      tmpRow2->setData(outV->rowBuf(i));
-      tmpRow3->setData(outG->rowBuf(i));
-
-      tmpRow3->cosSimDerivative(*(tmpRow2),
-                                *(tmpMtx0),
-                                *(tmpRow0),
-                                *(tmpMtx1),
-                                *(tmpRow1),
-                                config_.cos_scale());
-    }
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpRow1->setData(inG0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpMtx1->setData(inG1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+    tmpRow3->setData(outG->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpRow3);
+    inputs.addArg(*tmpRow2);
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpMtx1, ADD_TO);
+    outputs.addArg(*tmpRow1, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 7e9519f6b3af50..998b8d7d3034cb 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -367,8 +367,6 @@ void LambdaCost::backward(const UpdateCallback& callback) {
   getInputGrad(0)->add(*marginGrad_);
 }
 
-void LambdaCost::onPassEnd() {}
-
 void LambdaCost::calcGrad(const real* outputScore,
                           const real* score,
                           real* gradData,
@@ -611,14 +609,15 @@ class SumCostLayer : public Layer {
 public:
   explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     if (!ret) return ret;
     CHECK_EQ(inputLayers_.size(), 1UL);
     return true;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const MatrixPtr& input = getInputValue(0);
 
@@ -629,7 +628,7 @@ class SumCostLayer : public Layer {
     output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
   }
 
-  virtual void backward(const UpdateCallback& callback = nullptr) {
+  void backward(const UpdateCallback& callback = nullptr) override {
     getInputGrad(0)->add((real)1);
   }
 };
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 7f73bdb3f7d63e..b3045e0b31308a 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -32,15 +32,16 @@ class CostLayer : public Layer {
 public:
   explicit CostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[1]; }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void forwardImp(Matrix& outputValue,
                           Argument& label,
@@ -68,11 +69,14 @@ class MultiClassCrossEntropy : public CostLayer {
   explicit MultiClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -95,11 +99,14 @@ class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
   explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr sftMaxSum_;
@@ -117,11 +124,14 @@ class SoftBinaryClassCrossEntropy : public CostLayer {
   explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr targetPerDim_;
@@ -139,11 +149,14 @@ class SumOfSquaresCostLayer : public CostLayer {
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -162,17 +175,18 @@ class RankingCost : public Layer {
 public:
   explicit RankingCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[2]; }
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
     (void)output;
@@ -214,17 +228,16 @@ class LambdaCost : public Layer {
 public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getScoreLayer() { return inputLayers_[1]; }
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback = nullptr);
-
-  void onPassEnd();
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   real calcNDCG(const real* outputScore, const real* score, int size);
   void calcGrad(const real* outputScore,
@@ -256,11 +269,14 @@ class MultiBinaryLabelCrossEntropy : public CostLayer {
   explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -282,13 +298,16 @@ class HuberTwoClass : public CostLayer {
 public:
   explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
   void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
   void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index b1e7d2082f1443..413efd4d3ecd73 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -35,14 +35,15 @@ class CudnnBatchNormLayer : public BatchNormBaseLayer {
 
   ~CudnnBatchNormLayer();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
   /**
    * reshape tensor of ioDesc_.
    */
   void reshape(int batchSize);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h
index b869c695bd7530..919b1efc4e4532 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -45,9 +45,10 @@ class CudnnConvLayer : public ConvBaseLayer {
 
   ~CudnnConvLayer();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
   void addBiases();
   void bpropBiases();
 };
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 072b2f9513f4ef..f0aa22fe3af90c 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -45,7 +45,8 @@ class CudnnPoolLayer : public PoolLayer {
                         hl_pooling_mode_t* mode = nullptr);
   explicit CudnnPoolLayer(const LayerConfig& config);
   ~CudnnPoolLayer();
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Reshape input and output tensor descriptor.
@@ -53,8 +54,8 @@ class CudnnPoolLayer : public PoolLayer {
    * So reshaping is needed.
    */
   void reshape(int batchSize);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index d3bc97bb6cd0b8..a9cf1f943c260a 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -33,13 +33,13 @@ class DataLayer : public Layer {
   /**
    * Prefetch sparse matrix/ids only.
    */
-  void prefetch() { output_ = data_; }
+  void prefetch() override { output_ = data_; }
 
   /**
    * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
    * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
    */
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     copyDataToOutput(output_);
     if (FLAGS_show_layer_stat) {
@@ -50,9 +50,9 @@ class DataLayer : public Layer {
   /**
    * Data layer's backward propagation do nothing.
    */
-  virtual void backward(const UpdateCallback& callback) { (void)callback; }
+  void backward(const UpdateCallback& callback) override { (void)callback; }
 
-  virtual void copyOutputToOtherDevice() {
+  void copyOutputToOtherDevice() override {
     for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
       copyDataToOutput(outputOtherDevice_[i]);
     }
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index b3043cffd210fe..f0fd044e5b8343 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -44,10 +44,11 @@ class DataNormLayer : public Layer {
 
   ~DataNormLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   int mode_;
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index fa53e2e4cfc8a2..686f1fa0543cb3 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -27,14 +27,14 @@ class EosIdCheckLayer : public Layer {
 public:
   explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     const Argument& input = getInput(0);
@@ -42,7 +42,7 @@ class EosIdCheckLayer : public Layer {
     output_.ids->isEqualTo(*input.ids, config_.eos_id());
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(eos_id, EosIdCheckLayer);
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index 8445642217cf3e..aabcdfc392d3e2 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -48,7 +48,8 @@ class ExpandConvBaseLayer : public ConvBaseLayer {
 
   ~ExpandConvBaseLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   size_t getOutputSize();
   /**
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index de81a017e1bac3..60681690e5dd55 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -35,10 +35,11 @@ class ExpandConvLayer : public ExpandConvBaseLayer {
 
   ~ExpandConvLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
index 4a527d67995e25..00b8f241889fdd 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -34,10 +34,11 @@ class ExpandConvTransLayer : public ExpandConvBaseLayer {
 
   ~ExpandConvTransLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 5c636144235cdb..c21b3350e2bc4b 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -53,10 +53,11 @@ class ExpandLayer : public Layer {
 
   ~ExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index d023074c521675..b3850f543af74a 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -46,10 +46,11 @@ class FeatureMapExpandLayer : public Layer {
 
   ~FeatureMapExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
@@ -95,6 +96,9 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 
 void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
   MatrixPtr outGrad = getOutputGrad();
   size_t batchSize = getInput(0).getBatchSize();
   int imgSize = inGrad->getWidth();
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index ccd584585c97cb..64e7a050125aa9 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -36,13 +36,14 @@ class FullyConnectedLayer : public Layer {
   explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
   ~FullyConnectedLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void prefetch();
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 930d9a056164e7..d3aeea921801da 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -314,13 +314,13 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
 
   batchValue_->resizeOrCreate(*output_.value);
   batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_ && bias_->getWGrad()) {
+  if (bias_) {
     gate_.value->addBias(*(bias_->getW()), 1);
   }
 
   {
     int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
+    int curBatchSize = 0;
     AsyncGpuBlock asyncGpuBlock;
     for (int n = 0; n < numBatch; n++) {
       MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
@@ -330,16 +330,17 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
       gruValue.resetOutputValue =
           (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
 
-      batchSize = outputValueTmp->getHeight();
+      curBatchSize = outputValueTmp->getHeight();
       gruValue.prevOutValue =
-          (n == 0 ? nullptr
-                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
+          (n == 0
+               ? nullptr
+               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
 
       {
         if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), batchSize);
+          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
         } else {
-          GruCompute::forward<0>(gruValue, getSize(), batchSize);
+          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
         }
       }
     }
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index 25770ce57fbaa4..58dd760eb870e9 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -50,17 +50,18 @@ class GatedRecurrentLayer : public Layer, public GruCompute {
 public:
   explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   void forwardSequence(int batchSize,
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index b77fdbb30e11b7..4e29efd4612b18 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -22,17 +22,18 @@ class GetOutputLayer : public Layer {
 
   ~GetOutputLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
     CHECK_EQ(1U, inputLayers_.size());
     CHECK_NE(inputArgument_[0], "");
     return true;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     output_ = getPrev(0)->getOutput(inputArgument_[0]);
   }
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 REGISTER_LAYER(get_output, GetOutputLayer);
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 4a1006aa941f39..5b5cb25f9269a3 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -55,10 +55,11 @@ class GruStepLayer : public Layer, public GruCompute {
 
   ~GruStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(gru_step, GruStepLayer);
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 70da3ac126e147..3f6875fb9f007c 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -61,9 +61,10 @@ class HierarchicalSigmoidLayer : public Layer {
 public:
   explicit HierarchicalSigmoidLayer(const LayerConfig& config)
       : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 44fe1fb1fea420..eac7428571980b 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -43,10 +43,11 @@ class InterpolationLayer : public Layer {
 
   ~InterpolationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(interpolation, InterpolationLayer);
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index c47943f81c0158..f76d41ad3e8a3b 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
 
 #include "AddtoLayer.h"
@@ -334,7 +335,8 @@ void Layer::showOutputStats() {
 
 void Layer::forwardActivation() {
   /* activation */
-  activation_->forward(output_);
+  auto status = activation_->forward(output_);
+  status.check();
 
   /* dropout */
   if (config_.drop_rate() > 0) {
@@ -372,7 +374,8 @@ void Layer::backwardActivation() {
     oGrad->dotMul(*oGrad, *dropOutMask_);
   }
 
-  activation_->backward(output_);
+  auto status = activation_->backward(output_);
+  status.check();
 }
 
 void Layer::forwardDropOut() {
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 6dfd48fb966181..7c4bea072157aa 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -311,6 +311,7 @@ class Layer {
         return *output->second;
       } else {
         LOG(FATAL) << "No specific output " << str;
+        return *((Argument*)nullptr);
       }
     }
   }
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index f49df2c412f05f..c45a52d2e9aaf4 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -74,17 +74,18 @@ class LstmLayer : public Layer, public LstmCompute {
 public:
   explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
 
-  bool init(const LayerMap &layerMap, const ParameterMap &parameterMap);
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback &callback);
+  void backward(const UpdateCallback &callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index 5fc6474b8653f4..568277a90c62c7 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -35,10 +35,11 @@ class LstmStepLayer : public Layer, public LstmCompute {
 
   ~LstmStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(lstm_step, LstmStepLayer);
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index fb41af56319549..be0f2a07d4aae2 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -181,11 +181,12 @@ class MDLstmLayer : public LstmLayer {
 public:
   explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   void forwardOneSequence(int start, CoordIterator& coordIter);
@@ -506,9 +507,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
           *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
     }
   }
-  activationGate_->forward(frameInputGate_[idxCurr]);
-  activationGate_->forward(frameForgetGate_[idxCurr]);
-  activation_->forward(frameInputNode_[idxCurr]);
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
 
   frameState_[idxCurr].value->zeroMem();
   for (int i = 0; i < numDims_; i++) {
@@ -530,10 +534,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
 
   frameOutputGate_[idxCurr].value->addDotMul(
       *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  activationGate_->forward(frameOutputGate_[idxCurr]);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
 
   framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  activationState_->forward(framePreOutput_[idxCurr]);
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
 
   frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
                                       *frameOutputGate_[idxCurr].value);
@@ -640,12 +646,12 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
 
   framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                         *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
   frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
 
   frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                          *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
 
   frameState_[idxCurr].grad->addDotMul(
       *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
@@ -702,9 +708,9 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
     }
   }
 
-  activationGate_->backward(frameInputGate_[idxCurr]);
-  activationGate_->backward(frameForgetGate_[idxCurr]);
-  activation_->backward(frameInputNode_[idxCurr]);
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
 
   if (bias_->getWGrad()) {
     for (int i = 0; i < numDims_; i++) {
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 80555f3f7b3241..9e72b167cd963a 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -30,8 +30,8 @@ class MaxIdLayer : public Layer {
 public:
   explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
 
@@ -40,7 +40,7 @@ class MaxIdLayer : public Layer {
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const Argument& input = getInput(0);
     size_t batchSize = input.getBatchSize();
@@ -54,7 +54,7 @@ class MaxIdLayer : public Layer {
     input.value->rowMax(*output_.ids, *output_.in);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(maxid, MaxIdLayer);
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 472ee0ccca1962..baa58ca2d7a697 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -42,14 +42,13 @@ class MaxLayer : public SequencePoolLayer {
 public:
   explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
-  ~MaxLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     return SequencePoolLayer::init(layerMap, parameterMap);
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 59c2245e0d6490..73fd8536be56b2 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -45,10 +45,11 @@ class MaxOutLayer : public Layer {
   explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
   virtual ~MaxOutLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index 9655a152c7bc96..755c9deb8b1be3 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -35,21 +35,22 @@ class MixedLayer : public Layer {
 
   ~MixedLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void prefetch();
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
-  virtual void resetState();
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void resetState() override;
   /**
    * setState() should be called after getState().
    * Argument state consists of all projections states.
    */
-  virtual void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
   /**
    * Return state which consists of all projections states.
    */
-  virtual LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index d09720c5255747..297972b3cd9e4d 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -69,10 +69,11 @@ class MultiplexLayer : public Layer {
 
   ~MultiplexLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 5ab765247f63df..0bc2ef11829337 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -61,7 +61,8 @@ class NCELayer : public Layer {
         rand_(0, config.num_classes() - 1),
         prepared_(false) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     /* Initialize the basic parent class */
     Layer::init(layerMap, parameterMap);
 
@@ -146,7 +147,7 @@ class NCELayer : public Layer {
     prepared_ = true;
   }
 
-  void prefetch() {
+  void prefetch() override {
     prepareSamples();
     IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
     int* ids = labelIds_->getData();
@@ -163,7 +164,7 @@ class NCELayer : public Layer {
     }
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     CHECK(!useGpu_) << "GPU is not supported";
@@ -193,12 +194,13 @@ class NCELayer : public Layer {
       forwardOneInput(l);
     }
 
-    activation_->forward(sampleOut_);
+    auto status = activation_->forward(sampleOut_);
+    status.check();
 
     forwardCost();
   }
 
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     Matrix::resizeOrCreate(sampleOut_.grad,
                            1,
                            samples_.size(),
@@ -207,7 +209,8 @@ class NCELayer : public Layer {
 
     backwardCost();
 
-    activation_->backward(sampleOut_);
+    auto status = activation_->backward(sampleOut_);
+    status.check();
 
     if (biases_->getWGrad()) {
       backwardBias(callback);
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 011bab8fdedab0..e77faaa3225709 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -30,7 +30,8 @@ class NormLayer : public Layer {
 public:
   explicit NormLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     Layer::init(layerMap, parameterMap);
     return true;
   }
@@ -56,9 +57,10 @@ class ResponseNormLayer : public NormLayer {
 public:
   explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType) { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
+  void backward(const UpdateCallback& callback = nullptr) override {
     LOG(FATAL) << "Not implemented";
   }
 };
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 2c0d8a3a718c48..2997ae8848c438 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -36,9 +36,10 @@ class CMRProjectionNormLayer : public ResponseNormLayer {
 
   size_t getSize();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   TensorShape shape_;
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index b606e4436567eb..283fdb003a2bb9 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -38,10 +38,11 @@ class OuterProdLayer : public Layer {
 
   ~OuterProdLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(out_prod, OuterProdLayer);
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
new file mode 100644
index 00000000000000..bb618c09f97777
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pad, PadLayer);
+
+bool PadLayer::init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  auto& pad_conf = config_.inputs(0).pad_conf();
+  auto& img_conf = pad_conf.image_conf();
+  CHECK_EQ(config_.inputs_size(), 1);
+  inDims_ = TensorShape(
+      {0,
+       img_conf.channels(),
+       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
+       img_conf.img_size()});
+
+  CHECK_EQ(2, pad_conf.pad_c_size());
+  CHECK_EQ(2, pad_conf.pad_h_size());
+  CHECK_EQ(2, pad_conf.pad_w_size());
+  padc_.push_back(pad_conf.pad_c(0));
+  padc_.push_back(pad_conf.pad_c(1));
+  padh_.push_back(pad_conf.pad_h(0));
+  padh_.push_back(pad_conf.pad_h(1));
+  padw_.push_back(pad_conf.pad_w(0));
+  padw_.push_back(pad_conf.pad_w(1));
+
+  outDims_ = TensorShape(4);
+  setOutDims(0);
+
+  createFunction(forward_,
+                 "Pad",
+                 FuncConfig()
+                     .set("cstart", padc_[0])
+                     .set("cend", padc_[1])
+                     .set("hstart", padh_[0])
+                     .set("hend", padh_[1])
+                     .set("wstart", padw_[0])
+                     .set("wend", padw_[1]));
+  createFunction(backward_,
+                 "PadGrad",
+                 FuncConfig()
+                     .set("cstart", padc_[0])
+                     .set("cend", padc_[1])
+                     .set("hstart", padh_[0])
+                     .set("hend", padh_[1])
+                     .set("wstart", padw_[0])
+                     .set("wend", padw_[1]));
+
+  return true;
+}
+
+void PadLayer::setOutDims(const size_t batchSize) {
+  outDims_.reshape({batchSize,
+                    inDims_[1] + padc_[0] + padc_[1],
+                    inDims_[2] + padh_[0] + padh_[1],
+                    inDims_[3] + padw_[0] + padw_[1]});
+}
+
+void PadLayer::setTensorDim(const size_t batchSize) {
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+  setOutDims(batchSize);
+}
+
+void PadLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  setTensorDim(batchSize);
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(batchSize, size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("PadForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void PadLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
new file mode 100644
index 00000000000000..b2bbf28082e630
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer pads zeros to inputs according to the specify dimension.
+ *         The input and output is a 4D tensor. Padding zeros from the 2nd to
+ *         the 4th dimenstion according padc_, padh_ and padw_.
+ */
+class PadLayer : public Layer {
+public:
+  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PadLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  void setOutDims(const size_t batchSize);
+  void setTensorDim(const size_t batchSize);
+
+  std::vector<int> padc_;
+  std::vector<int> padh_;
+  std::vector<int> padw_;
+  TensorShape inDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index a82497fc01ca1f..9a11b81ebf1f5c 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -56,9 +56,10 @@ class ParameterReluLayer : public Layer {
 
   ~ParameterReluLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 318b89d7c2bce8..d43292ad2d4bbe 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -46,7 +46,8 @@ class PoolLayer : public Layer {
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 3dc6af2f0e9fb1..e31116de8ccb1f 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -40,7 +40,7 @@ class PoolProjectionLayer : public PoolLayer {
 
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 64fecab5b08354..31c34b43e2995a 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -40,10 +40,11 @@ class PowerLayer : public Layer {
 
   ~PowerLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(power, PowerLayer);
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index ac7f658864fee6..de198af111be42 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -19,38 +19,17 @@ namespace paddle {
 class PrintLayer : public Layer {
 public:
   explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
-};
 
-void PrintLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const auto& argu = getInput(i);
-    const std::string& name = inputLayers_[i]->getName();
-    if (argu.value) {
-      std::ostringstream os;
-      argu.value->print(os);
-      LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-    }
-    if (argu.ids) {
-      std::ostringstream os;
-      argu.ids->print(os, argu.ids->getSize());
-      LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-    }
-    if (auto startPos = argu.sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-    }
-    if (auto subStartPos = argu.subSequenceStartPositions) {
-      std::ostringstream os;
-      subStartPos->getVector(false)->print(os, subStartPos->getSize());
-      LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                << os.str();
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    for (size_t i = 0; i != inputLayers_.size(); ++i) {
+      getInput(i).printValueString(LOG(INFO),
+                                   "layer=" + inputLayers_[i]->getName() + " ");
     }
   }
-}
+
+  void backward(const UpdateCallback& callback) override {}
+};
 
 REGISTER_LAYER(print, PrintLayer);
 
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 36ace7597cd66c..bcf5e912a50fef 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -30,10 +30,11 @@ namespace paddle {
 class PriorBoxLayer : public Layer {
 public:
   explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
 
 protected:
   int numPriors_;
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 55e0fdfb9048c0..e4c2b483d2fa40 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -45,17 +45,18 @@ class RecurrentLayer : public Layer {
 public:
   explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
@@ -217,21 +218,22 @@ void RecurrentLayer::forwardOneSequence(int start, int length) {
     if (prevOutput_) {
       frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
-    activation_->forward(frameOutput_[start]);
+    activation_->forward(frameOutput_[start]).check();
+
     for (int i = 1; i < length; ++i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
     if (prevOutput_) {
       prevOutput_->assign(*frameOutput_[start + length - 1].value);
     }
   } else {
-    activation_->forward(frameOutput_[start + length - 1]);
+    activation_->forward(frameOutput_[start + length - 1]).check();
     for (int i = length - 2; i >= 0; --i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
   }
 }
@@ -280,11 +282,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
   MatrixPtr weightT = weight_->getW()->getTranspose();
   if (!reversed_) {
     for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i - 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start]);
+    activation_->backward(frameOutput_[start]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start, length - 1)->getTranspose(),
@@ -294,11 +296,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     }
   } else {
     for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i + 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start + length - 1]);
+    activation_->backward(frameOutput_[start + length - 1]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
@@ -333,7 +335,7 @@ void RecurrentLayer::forwardBatch(int batchSize,
       }
       Argument arg;
       arg.value = batch2;
-      activation_->forward(arg);
+      activation_->forward(arg).check();
     }
   }
   batchValue_->copyBackSeq(*output_.value);
@@ -363,7 +365,7 @@ void RecurrentLayer::backwardBatch(int batchSize,
       Argument arg;
       arg.value = batch1;
       arg.grad = batch2;
-      activation_->backward(arg);
+      activation_->backward(arg).check();
 
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index af8dd61d84e2e5..78a74ff19a38cd 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -33,15 +33,15 @@ class RecurrentLayerGroup : public Layer {
   void initSubNetwork(NeuralNetwork* rootNetwork,
                       const ModelConfig& config,
                       const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu);
+                      bool useGpu) override;
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     network_->forward(inArgs, &outArgs, passType);
   }
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
     network_->backward(nullptr);
 
@@ -53,7 +53,8 @@ class RecurrentLayerGroup : public Layer {
   /**
    * @see Layer.accessSubNetwork
    */
-  void accessSubNetwork(const std::function<void(NeuralNetwork&)>& callback) {
+  void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) override {
     callback(*network_);
   }
 
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 7fcb3adea01b9d..eb3b63c106901f 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -20,18 +20,19 @@ namespace paddle {
 /**
  * @brief A layer for resizing a minibatch matrix h*w to h'*w'
  * @note
- * origin matrix height * witdth)
+ * origin matrix height * width)
  * resize matrix: (height * width / size) * size
  */
 class ResizeLayer : public Layer {
 public:
   explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 };
 
 REGISTER_LAYER(resize, ResizeLayer);
diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/gserver/layers/RotateLayer.cpp
new file mode 100644
index 00000000000000..7c71088d786ab2
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RotateLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(rotate, RotateLayer);
+
+bool RotateLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  height_ = config_.height();
+  width_ = config_.width();
+  CHECK_GT(height_, 0);
+  CHECK_GT(width_, 0);
+  return true;
+}
+
+void RotateLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr input = getInputValue(0);
+  batchSize_ = input->getHeight();
+  size_ = input->getWidth();
+  CHECK_GE(size_, height_ * width_);
+  CHECK_EQ(size_ % (height_ * width_), 0)
+      << "total size_ is not dividable by (height_ * width_), i.e., "
+      << "channel number should be an integer";
+  channels_ = size_ / (height_ * width_);
+
+  resizeOutput(batchSize_, size_);
+
+  MatrixPtr outV = getOutputValue();
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      MatrixPtr inputSample =
+          Matrix::create(input->getData() + b * size_ + c * height_ * width_,
+                         height_,
+                         width_,
+                         false,
+                         useGpu_);
+      MatrixPtr outputSample =
+          Matrix::create(outV->getData() + b * size_ + c * height_ * width_,
+                         width_,
+                         height_,
+                         false,
+                         useGpu_);
+      inputSample->rotate(outputSample, false, true /* clock-wise */);
+    }
+  }
+
+  if (getInputGrad(0)) {
+    zeroGrad();
+  }
+}
+
+void RotateLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr outputGrad = getOutputGrad();
+  if (outputGrad == NULL) {
+    return;
+  }
+  // the grad should be rotated in the reverse direction
+  MatrixPtr preGrad = getInputGrad(0);
+
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      MatrixPtr inputSampleGrad =
+          Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_,
+                         height_,
+                         width_,
+                         false,
+                         useGpu_);
+      MatrixPtr outputSampleGrad = Matrix::create(
+          outputGrad->getData() + b * size_ + c * height_ * width_,
+          width_,
+          height_,
+          false,
+          useGpu_);
+      MatrixPtr tmpGrad = nullptr;
+      outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */);
+      inputSampleGrad->add(*tmpGrad);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
new file mode 100644
index 00000000000000..1a64d4d5a51d9c
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
+ * domain
+ * The rotation is 90 degrees in clock-wise for each channel
+ * \f[
+ *   y(j,i,:) = x(M-i-1,j,:)
+ * \f]
+ * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
+ *
+ * The config file api is rotate_layer
+ *
+*/
+
+class RotateLayer : public Layer {
+public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+
+private:
+  int batchSize_;
+  int size_;
+  int height_;
+  int width_;
+  int channels_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 59ff5d41b52909..2538d99bb71fa1 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -35,8 +35,8 @@ class SamplingIdLayer : public Layer {
   explicit SamplingIdLayer(const LayerConfig& config)
       : Layer(config), rand1_(0, 1) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     if (useGpu_) {
@@ -48,7 +48,7 @@ class SamplingIdLayer : public Layer {
     return ret;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
@@ -83,7 +83,7 @@ class SamplingIdLayer : public Layer {
     output_.ids->copyFrom(ids.data(), batchSize);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(sampling_id, SamplingIdLayer);
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 7f0084be6b57f5..a38ee0857a7679 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -37,10 +37,11 @@ class ScalingLayer : public Layer {
 
   ~ScalingLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(scaling, ScalingLayer);
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 5eacff6b714399..d9a91de8a6f4da 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -192,7 +192,8 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) {
                                nnz,
                                /*trans=*/false,
                                /*useGpu=*/useGpu_);
-    activation_->forward(arg);
+    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
+    activation_->forward(arg).check();
   } else /* train and test in train, not generating */ {
     // during training, this layer output value is *Matrix*, which is input of
     // eg. multi-class-cross-entropy
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index bdf9a4652cc717..99126fdba542bd 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -65,9 +65,10 @@ class SelectiveFullyConnectedLayer : public Layer {
       : Layer(config), selCols_(nullptr) {}
 
   ~SelectiveFullyConnectedLayer() {}
-  void prefetch();
+  void prefetch() override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
@@ -90,8 +91,8 @@ class SelectiveFullyConnectedLayer : public Layer {
   void fillSelectiveData(
       const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index 069bc26e602ff7..4b24d8f0c852e1 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -21,9 +21,11 @@ namespace paddle {
 
 /**
  * A layer for concatenating the first sequence with the second sequence
- * following the first
- * Input: two sequences each containing some instances
+ * Input: two sequences each containing the same number of instances
+ *        seq1 = [a1, a2, ..., an]
+ *        seq2 = [b1, b2, ..., bn]
  * Output: a concatenated sequence of the two input sequences
+ *        out = [a1, b1, a2, b2, ..., an, bn]
  */
 
 class SequenceConcatLayer : public Layer {
@@ -35,10 +37,11 @@ class SequenceConcatLayer : public Layer {
 
   ~SequenceConcatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqconcat, SequenceConcatLayer);
@@ -167,13 +170,17 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) {
     size_t rightNumIns = 0;
     for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
       leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-          ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      if (inputGrad1) {
+        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
+            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      }
       offset += leftNumIns;
 
       rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-          ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      if (inputGrad2) {
+        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
+            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      }
       offset += rightNumIns;
     }
   }
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 4bfce766c769f4..7a13cd7ad0fecf 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -42,12 +42,11 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~SequenceLastInstanceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index aa9c132586e55d..85b51ccd1dc7e7 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -46,12 +46,11 @@ class SequencePoolLayer : public Layer {
 public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual ~SequencePoolLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 23924b0490851a..433592953b220e 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -20,9 +20,12 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * A layer for reshaping the sequence
- * Input: a sequence
- * Output: a sequence
+ *  A layer for reshaping the sequence. Assume the input sequence has
+ *  T instances, the dimension of each instance is M, and the input
+ *  reshape_dim is N, then the output sequence has T*M/N instances,
+ *  the dimension of each instance is N.
+ *
+ *  Note that T*M/N must be an integer.
  */
 
 class SequenceReshapeLayer : public Layer {
@@ -34,12 +37,11 @@ class SequenceReshapeLayer : public Layer {
 public:
   explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SequenceReshapeLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index b678f414b6d76f..faf98744a7fdcf 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -39,12 +39,11 @@ class SlopeInterceptLayer : public Layer {
 public:
   explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SlopeInterceptLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 32e88cf141a667..7d3cb80443801a 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -43,9 +43,8 @@ class SpatialPyramidPoolLayer : public Layer {
 public:
   explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SpatialPyramidPoolLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   ProjectionConfig getConfig(size_t sizeX_,
                              size_t sizeY_,
@@ -54,7 +53,7 @@ class SpatialPyramidPoolLayer : public Layer {
                              std::string& poolType_);
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index c52fbee26232ad..19b7ad1869af98 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -35,12 +35,11 @@ class SubSequenceLayer : public Layer {
 public:
   explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SubSequenceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(subseq, SubSequenceLayer);
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index aa99b49380d368..00f8519550bcff 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -41,12 +41,11 @@ class SumToOneNormLayer : public Layer {
 public:
   explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SumToOneNormLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index ac38ffb6205703..43992f692d3ce4 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -44,13 +44,12 @@ class TensorLayer : public Layer {
 public:
   explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~TensorLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index b43fa1ebfb0032..be10bb74f6b218 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 
 namespace paddle {
 /**
- * A layer for transposition.
+ * A layer for transposing a minibatch matrix.
  * \f[
      y = x^\mathrm{T}
  * \f]
@@ -32,9 +32,10 @@ class TransLayer : public Layer {
 public:
   explicit TransLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index 4c1de7b3b7d697..c8b2634a1366ed 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -26,7 +26,8 @@ class ValidationLayer : public Layer {
 public:
   explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
@@ -37,13 +38,13 @@ class ValidationLayer : public Layer {
     return inputLayers_[2];
   }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
 
-  virtual void onPassEnd() = 0;
+  void onPassEnd() override = 0;
 };
 
 /*
@@ -57,11 +58,12 @@ class AucValidation : public ValidationLayer {
         cpuLabel_(nullptr),
         cpuWeight_(nullptr) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   struct PredictionResult {
     PredictionResult(real __out, int __label) : out(__out), label(__label) {}
@@ -86,11 +88,12 @@ class PnpairValidation : public ValidationLayer {
   explicit PnpairValidation(const LayerConfig& config)
       : ValidationLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
 private:
   bool passBegin_;
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
index 3d9ae9249af66d..7e8d7379d26788 100644
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -30,9 +30,10 @@ class WarpCTCLayer : public Layer {
   explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
   ~WarpCTCLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 8165eb82693361..4f5fdbb37ce024 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -110,6 +110,18 @@ void testEvaluator(TestConfig testConf,
   testEvaluator->finish();
   LOG(INFO) << *testEvaluator;
 
+  std::vector<std::string> names;
+  testEvaluator->getNames(&names);
+  paddle::Error err;
+  for (auto& name : names) {
+    auto value = testEvaluator->getValue(name, &err);
+    ASSERT_TRUE(err.isOK());
+    LOG(INFO) << name << " " << value;
+    auto tp = testEvaluator->getType(name, &err);
+    ASSERT_TRUE(err.isOK());
+    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
+  }
+
   double totalScore2 = 0.0;
   if (testConf.testAccumulate) {
     testEvaluator->start();
@@ -129,6 +141,7 @@ void testEvaluatorAll(TestConfig testConf,
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
+  config.evaluatorConfig.set_top_k(5);
 
   config.inputDefs.push_back({INPUT_DATA, "output", 50});
   config.inputDefs.push_back({INPUT_LABEL, "label", 50});
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 66a70ecd41091b..14d9db52470b28 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -310,7 +310,11 @@ TEST(Layer, CTCLayer) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */ useGpu);
+    testLayerGrad(config,
+                  "ctc",
+                  100,
+                  /* trans */ false, /* useGpu */
+                  useGpu);
   }
 }
 
@@ -587,7 +591,11 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
 
   // Not support GPU now
-  testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */ false);
+  testLayerGrad(config,
+                "hsigmoid",
+                100,
+                /* trans */ false, /* useGpu */
+                false);
 }
 
 TEST(Layer, multi_cross) {
@@ -1022,8 +1030,12 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
 }
 
 TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                true);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                false);
 }
 
 void setPoolConfig(TestConfig* config,
@@ -1304,6 +1316,25 @@ TEST(Layer, ResizeLayer) {
   }
 }
 
+TEST(Layer, RotateLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("rotate");
+  const int CHANNEL = 2;
+  const int HEIGHT = 8;
+  const int WIDTH = 4;
+  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
+  config.layerConfig.set_size(INPUT_SIZE);
+  config.layerConfig.set_height(HEIGHT);
+  config.layerConfig.set_width(WIDTH);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "rotate", 100, false, useGpu);
+  }
+}
+
 TEST(Layer, NCELayer) {
   TestConfig config;
   size_t numClasses = 4;
@@ -1563,6 +1594,35 @@ TEST(Layer, MultiplexLayer) {
   }
 }
 
+TEST(Layer, PadLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("pad");
+
+  int c = 4;
+  int h = 31;
+  int w = 36;
+  size_t size = c * h * w;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PadConfig* pad = input->mutable_pad_conf();
+  ImageConfig* image = pad->mutable_image_conf();
+
+  image->set_channels(c);
+  image->set_img_size(h);
+  image->set_img_size_y(w);
+  pad->add_pad_c(1);
+  pad->add_pad_c(2);
+  pad->add_pad_h(2);
+  pad->add_pad_h(3);
+  pad->add_pad_w(3);
+  pad->add_pad_w(5);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "pad", 10, false, useGpu);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 23ae95852e8421..55427e2f12fd7b 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -148,11 +148,11 @@ LayerPtr createCTCLayer(string name,
 
   ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
 
-  softmaxActivation->forward(dataLayer->getOutput());
+  softmaxActivation->forward(dataLayer->getOutput()).check();
   layer->forward(PASS_GC);
 
   layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput());
+  softmaxActivation->backward(dataLayer->getOutput()).check();
 
   return layer;
 }
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 82a482f7014812..bf62229c03bb1d 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -372,7 +372,7 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
 }
 
 /* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK(!memAlloc);
   CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
   if (format_ == SPARSE_CSR) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index d3e8871cb5b320..860cad1047fc34 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -201,7 +201,7 @@ class CpuSparseMatrix : public Matrix {
   void zeroMem();
 
   /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
 
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 3ae237bc7de895..07450bfb0ef709 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -274,6 +274,18 @@ real GpuMatrix::getSum() {
   return sum;
 }
 
+real GpuMatrix::getMin() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMin();
+}
+
+real GpuMatrix::getMax() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMax();
+}
+
 void GpuMatrix::accumulateColSum(Matrix& src) {
   CHECK_EQ(getWidth(), src.getWidth());
   CHECK_EQ(getHeight(), (size_t)1);
@@ -371,11 +383,13 @@ MatrixPtr GpuMatrix::getTranspose() {
   }
 }
 
-void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<GpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -385,13 +399,27 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
 }
 
+void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<GpuMatrix>(width_, height_);
+  } else {
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+
+  real* dataRot = matRot->getData();
+  real* data = getData();
+  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
+}
+
 MatrixPtr GpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -704,6 +732,7 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   hl_matrix_top_k(maxVal.getData(),
                   maxVal.getStride(),
@@ -764,19 +793,32 @@ void GpuMatrix::maxoutBackward(Matrix& a,
 }
 
 /*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output, IVector& label) {
-  auto output_ptr = dynamic_cast<const GpuMatrix*>(&output);
-  auto label_ptr = dynamic_cast<const GpuIVector*>(&label);
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == 1)
+void GpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {
+  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
+  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
+  size_t numSamples = this->getHeight();
+  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
+  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
+
+  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
+  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
+  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
       << "Matrix dimensions are not equal";
 
-  hl_matrix_classification_error((real*)output_ptr->data_,
-                                 (int*)label_ptr->getData(),
-                                 data_,
-                                 height_,
-                                 output_ptr->width_);
+  size_t dim = gpuOutput->getWidth();
+  hl_matrix_classification_error(gpuTopVal->getData(),
+                                 gpuTopVal->getStride(),
+                                 gpuTopIds->getData(),
+                                 gpuOutput->getData(),
+                                 gpuOutput->getStride(),
+                                 dim,
+                                 topkSize,
+                                 numSamples,
+                                 gpuLabel->getData(),
+                                 this->getData());
 }
 
 /* copy -log(output[i * width + label]) to this->data[i] */
@@ -913,59 +955,6 @@ void GpuMatrix::softreluDerivative(Matrix& output) {
 void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
   BaseMatrix::scaledTanh(output, p1, p2);
 }
-void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  CHECK(output1.useGpu_ == true && output2.useGpu_ == true)
-      << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-  real* out = getData();
-  real* x = output1.getData();
-  real* y = output2.getData();
-  hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale);
-}
-void GpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true &&
-        prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true &&
-        prevGrad2.useGpu_ == true)
-      << "Matrix type are not equal";
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  real* grad = getData();
-  real* out = output.getData();
-  real* prevOutX = prevOut1.getData();
-  real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  hl_cossim_derivative(grad,
-                       out,
-                       prevOutX,
-                       prevOutY,
-                       prevGradX,
-                       prevGradY,
-                       dim,
-                       prevOut1.getHeight(),
-                       prevOut2.getHeight(),
-                       scale);
-}
 
 void GpuMatrix::randomizeUniform() {
   CHECK(isContiguous());
@@ -1690,11 +1679,13 @@ MatrixPtr CpuMatrix::getTranspose() {
   }
 }
 
-void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<CpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -1708,13 +1699,35 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   }
 }
 
+void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<CpuMatrix>(width_, height_);
+  } else {
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();
+  real* data = getData();
+
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      if (clockWise) {
+        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
+      } else {
+        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
+      }
+    }
+  }
+}
+
 MatrixPtr CpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -3040,7 +3053,7 @@ void CpuMatrix::rowMax(Matrix& max) {
   max.maxRows(*this);
 }
 
-/* get beam size of max ids and values */
+/* Get the top k elements of each row of this matrix */
 void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   CHECK(isContiguous());
   CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
@@ -3048,6 +3061,7 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   real* a = getData();
   int* s = maxIds.getData();
@@ -3199,32 +3213,39 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) {
 }
 
 /* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&output));
-  CHECK(dynamic_cast<const CpuIVector*>(&label));
+void CpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {
+  size_t numSamples = this->getHeight();
+  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
+  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
+  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
+  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
+
+  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
+  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
+  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
+      << "Matrix dimensions are not equal";
 
-  CHECK_EQ(getWidth(), (size_t)1);
-  size_t numSamples = getHeight();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
+  // top k matrix classification
+  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
 
-  size_t dim = output.getWidth();
-  real* out = output.getData();
-  int* lbl = label.getData();
-  real maxData = 0.0;
-  int maxIndex = -1;
+  size_t dim = cpuOutput->getWidth();
+  real* result = this->getData();
+  int* ids = cpuTopIds->getData();
+  int* lbl = cpuLabel->getData();
   for (size_t i = 0; i < numSamples; ++i) {
     CHECK_GE(lbl[i], 0);
     CHECK_LT((size_t)lbl[i], dim);
-    maxData = out[i * dim];
-    maxIndex = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (maxData < out[i * dim + j]) {
-        maxIndex = j;
-        maxData = out[i * dim + j];
+
+    for (size_t j = 0; j < topkSize; ++j) {
+      if (ids[j + i * topkSize] == lbl[i]) {
+        result[i] = 0;
+        break;
       }
+      result[i] = 1.0f;
     }
-    getData()[i] = (maxIndex != lbl[i]);
   }
 }
 
@@ -3418,105 +3439,6 @@ void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
   }
 }
 
-void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-
-  real* out = getData();
-  const real* x = output1.getData();
-  const real* y = output2.getData();
-  size_t yInc = dim;
-  if (output2.getHeight() == 1LU) {
-    yInc = 0;
-  } else {
-    CHECK_EQ(output2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i, x += dim, y += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(x[j]);
-      squareSumY += _square(y[j]);
-      xy += x[j] * y[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    out[i] = scale * xy / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-  }
-}
-
-void CpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  const real* grad = getData();
-  const real* out = output.getData();
-  const real* prevOutX = prevOut1.getData();
-  const real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  size_t yInc = dim;
-  if (prevOut2.getHeight() == 1LU) {
-    yInc = 0;
-    CHECK_EQ(prevGrad2.getHeight(), 1LU);
-  } else {
-    CHECK_EQ(prevOut2.getHeight(), numSamples);
-    CHECK_EQ(prevGrad2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i,
-              prevOutX += dim,
-              prevOutY += yInc,
-              prevGradX += dim,
-              prevGradY += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(prevOutX[j]);
-      squareSumY += _square(prevOutY[j]);
-      xy += prevOutX[j] * prevOutY[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    if (xy == 0) {
-      real reciprocal = 1.0f / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += scale * grad[i] * prevOutY[j] * reciprocal;
-        prevGradY[j] += scale * grad[i] * prevOutX[j] * reciprocal;
-      }
-    } else {
-      real reciprocalXY = 1.0f / xy;
-      real reciprocalSquareSumX = 1.0f / squareSumX;
-      real reciprocalSquareSumY = 1.0f / squareSumY;
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += out[i] * grad[i] * (prevOutY[j] * reciprocalXY -
-                                            prevOutX[j] * reciprocalSquareSumX);
-        prevGradY[j] += out[i] * grad[i] * (prevOutX[j] * reciprocalXY -
-                                            prevOutY[j] * reciprocalSquareSumY);
-      }
-    }
-  }
-}
-
 void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index dd24f8821d4976..d0ba2e93feabfc 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -31,6 +31,7 @@ limitations under the License. */
 
 namespace paddle {
 
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
 
 /**
@@ -56,6 +57,7 @@ enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
  *            value [1, 1, 2, 2, 5]
  * @endcode
  */
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
 
 class Matrix;
@@ -370,7 +372,27 @@ class Matrix : public BaseMatrix {
    * allocate matTrans' memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
+  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  rotate 90 degrees in clock-wise if clockWise=true;
+   *         otherwise rotate in anti clock-wise
+   * clock-wise:
+   * \f[
+   *   y(j,i) = x(M-i-1,j)
+   * \f]
+   * anti clock-wise:
+   * \f[
+   *   y(j,i) = x(i, N-1-j)
+   * \f]
+   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+   *
+   * allocate matRot' memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -385,7 +407,7 @@ class Matrix : public BaseMatrix {
    * if allocate matInv's memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -777,26 +799,6 @@ class Matrix : public BaseMatrix {
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * cosine similarity, for each row i,
-   *   this[i] = cos(output1[i], output2[i])
-   *
-   * output2 can only have one row, then for each row i,
-   *   this[i] = cos(output1[i], output2[0])
-   */
-  virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void cosSimDerivative(Matrix& output,
-                                Matrix& prevOut1,
-                                Matrix& prevOut2,
-                                Matrix& prevGrad1,
-                                Matrix& prevGrad2,
-                                real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
   /// print out the values of elements to os
   virtual void print(std::ostream& os) const {
     LOG(FATAL) << "Not implemented";
@@ -834,8 +836,11 @@ class Matrix : public BaseMatrix {
    * output[i] = 1 if row i is an error.
    *
    * output[i] = 0 if row i is correct.
+   *
    */
-  virtual void classificationError(Matrix& output, IVector& label) {
+  virtual void classificationError(Matrix& output,
+                                   IVector& label,
+                                   size_t topkSize = 1) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -1167,11 +1172,15 @@ class GpuMatrix : public Matrix {
   void accumulateColSum(Matrix& src);
   real getAbsSum();
 
+  real getMin();
+  real getMax();
+
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   /// add b to each sample of this.
   void addBias(Matrix& b, real scale);
@@ -1298,14 +1307,6 @@ class GpuMatrix : public Matrix {
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   virtual void print(std::ostream& os) const;
   virtual void print(std::ostream& os, size_t height, size_t width) const;
 
@@ -1316,7 +1317,7 @@ class GpuMatrix : public Matrix {
   void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void convExpand(Matrix& feature,
                   int feaImgHeight,
@@ -1483,10 +1484,11 @@ class CpuMatrix : public Matrix {
   real getAbsSum();
 
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   void copyFrom(const Matrix& src);
 
@@ -1725,14 +1727,6 @@ class CpuMatrix : public Matrix {
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   void print(std::ostream& os) const;
   void print(std::ostream& os, size_t height, size_t width) const;
   void printOneRow(std::ostream& os, size_t idx) const;
@@ -1748,7 +1742,7 @@ class CpuMatrix : public Matrix {
 
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
 
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
new file mode 100644
index 00000000000000..dbb829c4e24a65
--- /dev/null
+++ b/paddle/math/RowBuffer.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * @brief The RowBuffer class
+ * Represent the SparseRow Matrix Data.
+ *
+ * If not set memory handler, then the data could be auto growth.
+ */
+class RowBuffer {
+public:
+  /**
+   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+
+  /**
+   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
+   * @param mem the pre-allocated memory.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  /**
+   * @brief resize resize the buffer with rowCount
+   * @param rowCnt number of row. matrix height.
+   */
+  inline void resize(int rowCnt) {
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* get(int row) const {
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return const_cast<real*>(rowStore_.data() + row * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  /**
+   * @brief clear local buffer. It only affect auto-growth buffer.
+   */
+  inline void clear() { rowStore_.clear(); }
+
+  /**
+   * @brief get current number of rows.
+   * @return number of rows.
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  /**
+   * @brief get is this buffer can automatically grow or not.
+   * @return ture if can automacitally grow.
+   */
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
+
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+
+private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 720a035ecbd26d..6370c77386688a 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -177,7 +177,6 @@ GpuSparseMatrix::GpuSparseMatrix(real* value,
       hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
       sMatrix_ = tmp2;
     }
-    LOG(INFO) << "weight to matrix ";
   }
 }
 
@@ -498,7 +497,7 @@ void GpuSparseMatrix::setRow(size_t row,
 
 SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
 
-void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK_EQ(format_, SPARSE_CSC);
   int nnz = sMatrix_->nnz;
   if (memAlloc) {
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 1d3801548e03a6..f6cd5df338965b 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -109,7 +109,7 @@ class GpuSparseMatrix : public Matrix {
   MatrixPtr getTranspose();
 
   /// B = A'
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void copyFrom(const Matrix& src);
   void copyFrom(const Matrix& src, hl_stream_t stream);
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index b61c6b2d49ccea..b8c781ca1fd46c 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,10 +24,6 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"
 
-DEFINE_bool(allow_inefficient_sparse_update,
-            false,
-            "Whether to allow inefficient sparse update");
-
 namespace paddle {
 
 const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 778a9bd8456618..1ccbf97b25922a 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -18,10 +18,9 @@ limitations under the License. */
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
+#include "RowBuffer.h"
 #include "paddle/utils/Util.h"
 
-DECLARE_bool(allow_inefficient_sparse_update);
-
 namespace paddle {
 
 /**
@@ -45,12 +44,9 @@ class SparseRowCpuMatrix : public CpuMatrix {
                      IndexDictPtr indexDictHandle = nullptr,
                      bool trans = false)
       : CpuMatrix(nullptr, height, width, trans),
-        storeMat_(dataHandle,
-                  dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0,
-                  width,
-                  trans),
         indexDictHandle_(indexDictHandle) {
     init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
   }
 
   virtual ~SparseRowCpuMatrix() {}
@@ -71,25 +67,16 @@ class SparseRowCpuMatrix : public CpuMatrix {
    *
    *  @param row row id in local storage
    */
-  real* getLocalRow(size_t row) {
-    if (storeMat_.getData()) return storeMat_.rowBuf(row);
-    if (rowStore_.size() <= row * width_) {
-      rowStore_.resize((row + 1) * width_);
-    }
-    return rowStore_.data() + row * width_;
-  }
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
 
   /**
-   *  reserve the storage for rows according to current size of indexDictHandle.
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
    *
    *  This is only used when SparseRowCpuMatrix is constructed with
    *  indexDictHandle.
    */
-  void reserveStore() {
-    if (!storeMat_.getData() && !localIndices_->empty()) {
-      rowStore_.resize(localIndices_->size() * width_);
-    }
-  }
+  void reserveStore() { buf_->resize(localIndices_->size()); }
 
   // row is the row id in the original matrix
   virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -117,7 +104,8 @@ class SparseRowCpuMatrix : public CpuMatrix {
    *
    * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
    *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter matrix,
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
    * store the time that each weight row last updated.
    *
    * Time is batchId, currentTime is current batchId.
@@ -176,8 +164,7 @@ class SparseRowCpuMatrix : public CpuMatrix {
 protected:
   template <typename Func>
   void apply(Func f) {
-    real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data();
-    f(data, localIndices_->size() * width_);
+    f(buf_->data(), localIndices_->size() * width_);
   }
 
   void init(size_t height, size_t width);
@@ -188,25 +175,23 @@ class SparseRowCpuMatrix : public CpuMatrix {
       globalIndices_[id] = kUnusedId_;
     }
     localIndices_->clear();
-    rowStore_.clear();
+    buf_->clear();
   }
 
   inline void checkStoreSize() {
-    if (storeMat_.getData()) {
-      CHECK_LE(localIndices_->size(), storeMat_.getHeight());
-    } else if (!FLAGS_allow_inefficient_sparse_update) {
-      if (localIndices_->size() > 0.5 * height_) {
-        LOG(WARNING)
-            << "There are more than 0.5*height (" << localIndices_->size()
-            << ") rows are used for sparse "
-            << "update, which is not efficient. Considering not use "
-            << "sparse_update or set --allow_inefficient_sparse_update=true";
+    if (buf_->isAutoGrowth()) {
+      if (buf_->getRowCount() > 0.5 * height_) {
+        LOG(WARNING) << "There are more than 0.5*height ("
+                     << localIndices_->size() << ") rows are used for sparse "
+                     << "update, which is not efficient. Considering not use "
+                     << "sparse_update.";
       }
+    } else {
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
     }
   }
 
-  CpuMatrix storeMat_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  std::unique_ptr<RowBuffer> buf_;
   IndexDictPtr indexDictHandle_;
   std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
   unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index 06fc10bae7232f..ceb96b2e250d8e 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
+add_simple_unittest(test_RowBuffer)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index a4084bdf7c6953..1c21da5b76e956 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -181,28 +181,6 @@ TEST(Matrix, copyByRowIndex) {
   }
 }
 
-void testCosSim(int heightX, int heightY, int width, real scale) {
-  AutoCompare test(heightX, 1);
-  CpuMatrix arg1(heightX, width);
-  CpuMatrix arg2(heightY, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg2.add(-0.5);
-  test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
-}
-
-TEST(Matrix, cosSim) {
-  for (auto heightX : {10, 100, 1000}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSim(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluForward(int height, int width, int w_height, int w_width) {
   AutoCompare test(height, width);
   CpuMatrix arg1(height, width);
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp
new file mode 100644
index 00000000000000..8cc4c69a1a4d8a
--- /dev/null
+++ b/paddle/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/RowBuffer.h"
+
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);
+  ASSERT_EQ(128UL, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+
+  auto data = buf.getWithAutoGrowth(2);
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    data[i] = i;
+  }
+
+  ASSERT_EQ(3UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
+  }
+}
+
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr mem =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(mem, 128);
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
+    }
+  }
+
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 9d3fbaef43d719..c0572dfdbf738a 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -248,11 +248,13 @@ TEST(Matrix, SparseMatrixTranspose) {
             /*dense matrix transpose*/
             CpuMatrixPtr matC(new CpuMatrix(height, width));
             matC->copyFrom(*matA);
-            CpuMatrixPtr matD(new CpuMatrix(width, height));
+            MatrixPtr matD(new CpuMatrix(width, height));
             matC->transpose(matD, false);
+
             /*check result*/
             checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB), matD);
+                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
+                std::dynamic_pointer_cast<CpuMatrix>(matD));
           }
         }
       }
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index f0c49791d7e2a6..08b64c1bb6f5d3 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -105,6 +105,21 @@ void testMatrixGetSum(int height, int width) {
   EXPECT_LE(fabs(cpuSum - gpuSum), err);
 }
 
+void testMatrixGetMinMax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  real cpuMin = cpuInput->getMin();
+  real gpuMin = gpuInput->getMin();
+  real cpuMax = cpuInput->getMax();
+  real gpuMax = gpuInput->getMax();
+
+  EXPECT_EQ(cpuMin, gpuMin);
+  EXPECT_EQ(cpuMax, gpuMax);
+}
+
 void testMatrixZeroAtOffset(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -161,11 +176,29 @@ void testMatrixTranspose(int height, int width) {
   cpu->randomizeUniform();
   gpu->copyFrom(*cpu);
   cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, false);
+  gpu->transpose(gpuT, true);
 
   TensorCheckEqual(*cpuT, *gpuT);
 }
 
+void testMatrixRotate(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
+
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+
+  cpu->rotate(cpuR, false, true);
+  gpu->rotate(gpuR, true, true);
+  TensorCheckEqual(*cpuR, *gpuR);
+
+  cpu->rotate(cpuR, true, false);
+  gpu->rotate(gpuR, false, false);
+  TensorCheckEqual(*cpuR, *gpuR);
+}
+
 void testMatrixInverse(int height) {
   MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
   MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
@@ -181,7 +214,7 @@ void testMatrixInverse(int height) {
   cpu->add(*outputCheck);
 
   gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, false);
+  cpu->inverse(cpuI, true);
   gpu->inverse(gpuI, false);
 
   TensorCheckErr(*cpuI, *gpuI);
@@ -200,6 +233,7 @@ TEST(Matrix, unary) {
       testMatrixZeroAtOffset(height, width);
       testMatrixGetSum(height, width);
       testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
     }
     // inverse
     testMatrixInverse(height);
@@ -686,61 +720,6 @@ TEST(Matrix, sequenceAvgForward) {
   }
 }
 
-void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
-  MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
-  MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr prevGradX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevGradY = CpuMatrix::create(heightY, width, false, false);
-
-  prevOutX->randomizeUniform();
-  prevOutY->randomizeUniform();
-  grad->randomizeUniform();
-  output->randomizeUniform();
-  prevGradX->randomizeUniform();
-  prevGradY->randomizeUniform();
-
-  MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
-  MatrixPtr gradGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr prevGradXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevGradYGpu = GpuMatrix::create(heightY, width, false, true);
-
-  prevOutXGpu->copyFrom(*prevOutX);
-  prevOutYGpu->copyFrom(*prevOutY);
-  gradGpu->copyFrom(*grad);
-  outputGpu->copyFrom(*output);
-  prevGradXGpu->copyFrom(*prevGradX);
-  prevGradYGpu->copyFrom(*prevGradY);
-
-  grad->cosSimDerivative(
-      *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale);
-
-  gradGpu->cosSimDerivative(*outputGpu,
-                            *prevOutXGpu,
-                            *prevOutYGpu,
-                            *prevGradXGpu,
-                            *prevGradYGpu,
-                            scale);
-
-  TensorCheckErr(*prevGradX, *prevGradXGpu);
-  TensorCheckErr(*prevGradY, *prevGradYGpu);
-}
-
-TEST(Matrix, cosSimDerivate) {
-  for (auto heightX : {1, 10, 100}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {1, 10, 100}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimDerivate(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluBackwardDiff(int height,
                                int width,
                                int w_height,
@@ -785,7 +764,7 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }
 
-void testClassificationError(int numSamples, int dim) {
+void testClassificationError(int numSamples, int dim, int topkSize) {
   MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
   MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
   MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
@@ -798,17 +777,22 @@ void testClassificationError(int numSamples, int dim) {
   gpuOutput->copyFrom(*cpuOutput);
   gpuLabel->copyFrom(*cpuLabel);
 
-  cpuError->classificationError(*cpuOutput, *cpuLabel);
-  gpuError->classificationError(*gpuOutput, *gpuLabel);
+  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
+  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
 
   TensorCheckEqual(*cpuError, *gpuError);
 }
 
 TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
-    for (auto dim : {1, 10, 100, 1000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testClassificationError(numSamples, dim);
+  for (auto numSamples : {1, 5, 31, 90, 150, 300}) {
+    for (auto dim :
+         {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
+      for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+        if (topkSize > dim) continue;
+        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
+                << " dim= " << dim;
+        testClassificationError(numSamples, dim, topkSize);
+      }
     }
   }
 }
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
index 9aa74b15193723..47f461474622d1 100644
--- a/paddle/math/tests/test_matrixUtil.h
+++ b/paddle/math/tests/test_matrixUtil.h
@@ -30,6 +30,17 @@ void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
   }
 }
 
+void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  ASSERT_EQ(a.getFormat(), b.getFormat());
+  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
+  for (size_t r = 0; r < a.getElementCnt(); ++r) {
+    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
+  }
+}
+
 void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
                        const CpuSparseMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
@@ -73,6 +84,36 @@ void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
   }
 }
 
+void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+
+  if (a.getFormat() == SPARSE_CSC) {
+    int* rows = a.getRows();
+    for (size_t i = 0; i < a.getWidth(); i++) {
+      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
+        }
+      }
+    }
+  } else {
+    int* cols = a.getCols();
+    for (size_t i = 0; i < a.getHeight(); i++) {
+      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
+        }
+      }
+    }
+  }
+}
+
 void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
                              const CpuMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 65d01a15718ae2..7a343cca33f5b4 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
   tgtBuf[numSequences] = numSubSequences;
 }
 
+void Argument::getValueString(
+    std::unordered_map<std::string, std::string>* out) const {
+  if (value) {
+    std::ostringstream os;
+    value->print(os);
+    out->insert({"value", os.str()});
+  }
+  if (ids) {
+    std::ostringstream os;
+    ids->print(os, ids->getSize());
+    out->insert({"ids", os.str()});
+  }
+  if (sequenceStartPositions) {
+    std::ostringstream os;
+    sequenceStartPositions->getVector(false)->print(
+        os, sequenceStartPositions->getSize());
+    out->insert({"sequence pos", os.str()});
+  }
+  if (subSequenceStartPositions) {
+    std::ostringstream os;
+    subSequenceStartPositions->getVector(false)->print(
+        os, subSequenceStartPositions->getSize());
+    out->insert({"sub-sequence pos", os.str()});
+  }
+}
+
+void Argument::printValueString(std::ostream& stream,
+                                const std::string& prefix) const {
+  std::unordered_map<std::string, std::string> out;
+  getValueString(&out);
+  for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) {
+    auto it = out.find(field);
+    if (it != out.end()) {
+      stream << prefix << field << ":\n" << it->second;
+    }
+  }
+}
+
 void Argument::subArgFrom(const Argument& input,
                           size_t offset,
                           size_t height,
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index afd2de0202bf0f..178c068b93ac5f 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -297,6 +297,23 @@ struct Argument {
    sequence has sub-sequence degrades to a sequence.
    */
   void degradeSequence(const Argument& input, bool useGpu);
+
+  /**
+   * @brief getValueString will return the argument's output in string. There
+   * are several kinds of output. The keys of output dictionary are 'value',
+   * 'id', 'sequence pos', 'sub-sequence pos'.
+   * @param out [out]: the return values.
+   */
+  void getValueString(std::unordered_map<std::string, std::string>* out) const;
+
+  /**
+   * @brief printValueString will print the argument's output in order of
+   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
+   * @param stream: Output stream
+   * @param prefix: line prefix for printing.
+   */
+  void printValueString(std::ostream& stream,
+                        const std::string& prefix = "") const;
 };
 
 }  // namespace paddle
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 29d6e20dc16968..1ccded81879679 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -375,10 +375,6 @@ bool Parameter::load(const std::string& filename) {
   std::ifstream fs(filename, std::ios_base::binary);
   if (!fs) {
     LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (isStatic()) {
-      LOG(FATAL) << getName() << " is static but missing, not allowed.";
-      return false;
-    }
     if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
       LOG(FATAL) << getName() << " missing, not allowed.";
       return false;
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index b230e170c15f1b..6265c828a1a254 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -55,7 +55,7 @@ class ParameterUpdater {
   // between startBatch() and finishBatch(), update() will be called
   // by the trainer multiple times, each time for updating one Parameter
   // with its gradient in PARAMETER_GRADIENT
-  virtual void update(Parameter* para) {
+  void update(Parameter* para) {
     SetDevice setDevice(para->getDeviceId());
     para->updateHook();
     this->updateImpl(para);
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 9f86ee80f4e5cc..04236fda2fb62b 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-
 #include <gtest/gtest.h>
-
+#include <memory>
 #include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 DEFINE_string(server_addr, "127.0.0.1", "Server address");
 DEFINE_int64(dim, 50000000, "Data size");
@@ -162,18 +161,9 @@ TEST(ProtoServer, extended) {
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
-
-  MyServer* server;
-  if (FLAGS_rdma_tcp == "rdma") {
-    server = new MyServer(FLAGS_port, 0);
-  } else {
-    server = new MyServer(FLAGS_port);
-  }
-
-  server->start();
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
   usleep(10000);
 
-  int ret = RUN_ALL_TESTS();
-
-  exit(ret);
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 981d10afda2671..c009b05cdeeb9d 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -23,8 +23,18 @@
 class IScanner(object):
     def __init__(self, input_type, pos):
         self.input_type = input_type
-        assert isinstance(self.input_type, dp2.InputType)
+        if not isinstance(self.input_type, dp2.InputType):
+            raise ValueError("input type should be dataprovider2.InputType")
         self.pos = pos
+        # data_in_gpu is used to indicate whether to create argument on GPU
+        # or not in GPU mode. Now if using one thread (trainer_count=1),
+        # trainer uses NeuralNetwork which needs to create argument on GPU
+        # before calling forward function. So, set data_in_gpu to True.
+        # Otherwise, trainer uses MultiGradientMachine which will transfer
+        # data from CPU to GPU in the forward function, set data_in_gpu to
+        # False in this case.
+        self.data_in_gpu = swig_paddle.isUsingGpu(
+        ) and swig_paddle.getTrainerCount() == 1
 
     def scan(self, dat):
         pass
@@ -34,6 +44,10 @@ def finish_scan(self, argument):
 
 
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
+
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
@@ -46,8 +60,10 @@ def scan(self, dat):
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
+                                                    self.data_in_gpu)
         argument.setSlotValue(self.pos, m)
 
 
@@ -57,7 +73,6 @@ def __init__(self, input_type, pos):
         self.__rows__ = [0]
         self.__cols__ = []
         self.__height__ = 0
-        self.__nnz__ = 0
         self.__value__ = []
 
     def scan(self, dat):
@@ -70,11 +85,13 @@ def extend_cols(self, dat):
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createSparse(self.__height__,
-                                            self.input_type.dim,
-                                            len(self.__cols__),
-                                            len(self.__value__) == 0)
+        m = swig_paddle.Matrix.createSparse(
+            self.__height__,
+            self.input_type.dim,
+            len(self.__cols__),
+            len(self.__value__) == 0,
+            False,  # trans
+            False)  # TODO supoort GPU
         assert isinstance(m, swig_paddle.Matrix)
         m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
         argument.setSlotValue(self.pos, m)
@@ -98,7 +115,7 @@ def scan(self, dat):
         self.__ids__.append(dat)
 
     def finish_scan(self, argument):
-        ids = swig_paddle.IVector.create(self.__ids__)
+        ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
         assert isinstance(argument, swig_paddle.Arguments)
         argument.setSlotIds(self.pos, ids)
 
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index ce105d249aaf3e..a708def1d2d7f6 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -208,7 +208,7 @@ def getLayerOutputs(self, layerNames):
 
         output = dict()
         for name in layerNames:
-            output[name] = __matrix_to_numpy__(self.getLayerOutput(name))
+            output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name))
         return output
 
     swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 1522be023f6de3..27d19dc15c8bd6 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -4,28 +4,36 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
 
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y automake && \
+    apt-get clean -y
+
+RUN pip install --upgrade pip && \
+    pip install -U "protobuf==3.1.0" && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
+    cd .. && rm -rf cmake-3.4.1
+
+ARG BUILD_WOBOQ
+ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
+ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
+ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 RUN mkdir /paddle
@@ -33,10 +41,6 @@ COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 09f07043e21723..538233dd0c7bb5 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -4,28 +4,36 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
 
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y automake && \
+    apt-get clean -y
+
+RUN pip install --upgrade pip && \
+    pip install -U "protobuf==3.1.0" && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
+    cd .. && rm -rf cmake-3.4.1
+
+ARG BUILD_WOBOQ
+ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
+ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
+ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 RUN mkdir /paddle
@@ -33,10 +41,6 @@ COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 7edba3dd09cdc5..d9c44f42340323 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -8,42 +8,54 @@ function abort(){
 trap 'abort' 0
 set -e
 
-if [ ${WITH_GPU} == 'ON' ]; then
-  ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
+# If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied
+# source tree to /paddle, and this scripts should build it into
+# /paddle/build.
+if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+	ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
+    fi
+
+    mkdir -p /paddle/build # -p means no error if exists
+    cd /paddle/build
+    cmake .. \
+	  -DWITH_DOC=${WITH_DOC:-OFF} \
+	  -DWITH_GPU=${WITH_GPU:-OFF} \
+	  -DWITH_AVX=${WITH_AVX:-OFF} \
+	  -DWITH_SWIG_PY=ON \
+	  -DCUDNN_ROOT=/usr/ \
+	  -DWITH_STYLE_CHECK=OFF \
+	  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    make -j `nproc`
+    make install
+
+    if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
+        apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev 
+        # Install woboq_codebrowser.
+        git clone https://github.com/woboq/woboq_codebrowser /woboq
+        cd /woboq
+        cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+        -DCMAKE_BUILD_TYPE=Release \
+        .
+        make
+
+        export WOBOQ_OUT=/usr/share/nginx/html/paddle
+        export BUILD_DIR=/paddle/build
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+        -b /paddle/build \
+        -a \
+        -o $WOBOQ_OUT \
+        -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+        cd /woboq
+        make clean
+    fi
+
+    pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl
+    pip install /usr/local/opt/paddle/share/wheels/paddle*.whl
+    paddle version
 fi
 
-mkdir -p /paddle/build # -p means no error if exists
-cd /paddle/build
-cmake .. \
-      -DWITH_DOC=ON \
-      -DWITH_GPU=${WITH_GPU} \
-      -DWITH_AVX=${WITH_AVX} \
-      -DWITH_SWIG_PY=ON \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=OFF \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-make -j `nproc`
-make install
-
-# Install woboq_codebrowser.
-git clone https://github.com/woboq/woboq_codebrowser /woboq
-cd /woboq
-cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-      -DCMAKE_BUILD_TYPE=Release \
-      .
-make
-
-export WOBOQ_OUT=/usr/share/nginx/html/paddle
-export BUILD_DIR=/paddle/build
-mkdir -p $WOBOQ_OUT
-cp -rv /woboq/data $WOBOQ_OUT/../data
-/woboq/generator/codebrowser_generator \
-    -b /paddle/build \
-    -a \
-    -o $WOBOQ_OUT \
-    -p paddle:/paddle
-/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-cd /woboq
-make clean
-rm -rf /paddle/build
 trap : 0
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index 7036f971fdd7ba..80f031a74e7052 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
 brew update
 brew tap homebrew/science
-brew install python
-sudo pip install --upgrade protobuf
-brew install swig openblas md5sha1sum protobuf
+brew install openblas swig md5sha1sum
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index fd3aeb02b21d65..5e6350b5745859 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -6,7 +6,7 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
   export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
   export PYTHONHOME=/opt/python/2.7.12
   export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
   NRPOC=`nproc`
   make -j $NPROC
   make coveralls
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index bdafb145bcd4e5..6b43cad20b76e9 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,7 +4,7 @@
 source ./common.sh
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
 # check websites for broken links
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index c79666bc81b6f3..38621af065913c 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -55,6 +55,9 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
+os.environ["CC"] = "@CMAKE_C_COMPILER@"
+os.environ["CXX"] = "@CMAKE_CXX_COMPILER@"
+
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index c3207e63ce72b7..9e9e948b8856d2 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -184,7 +184,6 @@ class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
    * @param para
    */
   virtual void updateImpl(Parameter* para) {}
-  virtual void update(Parameter* para) {}
 };
 
 /**
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 8465addaf9e038..bd84545375117b 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -90,16 +90,6 @@ DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
-void Trainer::init(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-
-  init(config);
-}
-
 void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
                    bool testing,
                    const std::shared_ptr<GradientMachine>& gradientMachine,
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index 7cbf18ace7a5fe..c8ee4726c24c33 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -71,11 +71,6 @@ class Trainer {
       const std::shared_ptr<DataProvider>& dataProvider = nullptr,
       const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
 
-  /**
-   * Initialize Trainer from command line flags.
-   */
-  void init(int argc, char** argv);
-
   /**
    * Train until num_passes reached.
    * One pass means neural network train through all training data.
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 22e07bd0e98a4c..c5c76a030d9e5f 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,11 +1,3 @@
-################# test_Prediction ######################
-add_unittest_without_exec(test_Prediction
-    test_Prediction.cpp)
-add_test(NAME test_Prediction
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Prediction --merger=${CMAKE_CURRENT_BINARY_DIR}/../paddle_merge_model
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
 ################# test_Compare ############################
 add_unittest_without_exec(test_Compare
     test_Compare.cpp)
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
deleted file mode 100644
index 0c79404eee1c09..00000000000000
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-DECLARE_string(config);
-DECLARE_string(config_args);
-DEFINE_string(merger,
-              "./paddle_merge_model",
-              "path to paddle_merge_model binary");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile = "trainer/tests/sample_trainer_config.conf";
-static const string& mergedModelFile = "./test_model_file";
-static const string& modelDir = "./test_model_dir";
-
-void checkBuffer(real* vec1, real* vec2, size_t len) {
-  for (size_t i = 0; i < len; i++) {
-    EXPECT_EQ(vec1[i], vec2[i]) << "vec1:" << vec1[i] << " vec2:" << vec2[i];
-  }
-}
-
-void checkParameters(vector<ParameterPtr> A, vector<ParameterPtr> B) {
-  CHECK_EQ(B.size(), A.size()) << "parameter size not equal";
-  for (size_t i = 0; i < A.size(); i++) {
-    auto vec1 = A[i]->getBuf(PARAMETER_VALUE);
-    auto vec2 = B[i]->getBuf(PARAMETER_VALUE);
-    CHECK_EQ(vec1->useGpu_, vec2->useGpu_) << "use gpu not equal";
-    CHECK_EQ(vec1->getSize(), vec2->getSize()) << "size not equal";
-
-    if (vec1->useGpu_ == false) {
-      checkBuffer(vec1->getData(), vec2->getData(), vec1->getSize());
-    } else {
-      VectorPtr cpuVec1 = Vector::create(vec1->getSize(), false);
-      VectorPtr cpuVec2 = Vector::create(vec2->getSize(), false);
-      cpuVec1->copyFrom(*vec1, HPPL_STREAM_DEFAULT);
-      cpuVec2->copyFrom(*vec2, HPPL_STREAM_DEFAULT);
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      checkBuffer(cpuVec1->getData(), cpuVec2->getData(), cpuVec1->getSize());
-    }
-  }
-}
-
-TEST(GradientMachine, create) {
-#ifdef PADDLE_ONLY_CPU
-  FLAGS_use_gpu = false;
-#endif
-  mkDir(modelDir.c_str());
-  FLAGS_config = configFile;
-  FLAGS_config_args = "with_cost=False";
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-
-  // save model to directory
-  unique_ptr<GradientMachine> gradientMachine1(
-      GradientMachine::create(*config));
-  gradientMachine1->saveParameters(modelDir);
-  Trainer trainer;
-  trainer.init(config);
-  ParameterUtil* paramUtil = trainer.getParameterUtilPtr();
-  if (paramUtil != NULL) {
-    paramUtil->saveConfigWithPath(modelDir);
-  }
-
-  // create a different GradientMachine
-  unique_ptr<GradientMachine> gradientMachine2(
-      GradientMachine::create(*config));
-  gradientMachine2->randParameters();
-
-  // merge config and model to one file
-  string cmd = FLAGS_merger + " --model_dir=" + modelDir +
-               " --config_args=with_cost=False" + " --model_file=" +
-               mergedModelFile;
-  LOG(INFO) << cmd;
-  int ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // create GradientMachine from the merged model
-  DataConfig dataConfig;
-  unique_ptr<GradientMachine> gradientMachine3(
-      GradientMachine::create(mergedModelFile, &dataConfig));
-  CHECK(gradientMachine3);
-  EXPECT_EQ(dataConfig.type(), "simple");
-  EXPECT_EQ(dataConfig.feat_dim(), 3);
-
-  // compare the parameters of GradientMachine and GradientMachine3
-  std::vector<ParameterPtr> paraMachine1 = gradientMachine1->getParameters();
-  std::vector<ParameterPtr> paraMachine3 = gradientMachine3->getParameters();
-  checkParameters(paraMachine1, paraMachine3);
-
-  // Test that the GradientMachine created from the merged model
-  // is same as the orginnal one.
-  vector<Argument> inArgs(1);
-  vector<Argument> outArgs;
-
-  int inputDim = 3;
-  int numSamples = 2;
-  CpuMatrix cpuInput(numSamples, inputDim);
-  for (int i = 0; i < numSamples; ++i) {
-    for (int j = 0; j < inputDim; ++j) {
-      cpuInput.getData()[i * inputDim + j] =
-          rand() / (real)RAND_MAX;  // NOLINT TODO(yuyang): use rand_r
-    }
-  }
-  MatrixPtr input = Matrix::create(numSamples,
-                                   inputDim,
-                                   /* trans */ false,
-                                   FLAGS_use_gpu);
-  input->copyFrom(cpuInput);
-  inArgs[0].value = input;
-  gradientMachine1->forward(inArgs, &outArgs, PASS_TEST);
-  EXPECT_EQ((size_t)1, outArgs.size());
-
-  vector<Argument> outArgs2;
-  gradientMachine2->forward(inArgs, &outArgs2, PASS_TEST);
-  CpuMatrix out1(outArgs[0].value->getHeight(), outArgs[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  for (size_t i = 0; i < out1.getHeight() * out1.getWidth(); i++) {
-    EXPECT_NE(out1.getData()[i], out2.getData()[i]);
-  }
-
-  gradientMachine3->forward(inArgs, &outArgs2, PASS_TEST);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(
-      out1.getData(), out2.getData(), out2.getHeight() * out2.getWidth());
-
-  cmd = " rm -rf " + modelDir + "/*";
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  cmd = " rm -rf " + mergedModelFile;
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // clean up
-  rmDir(modelDir.c_str());
-  remove(mergedModelFile.c_str());
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
new file mode 100644
index 00000000000000..cebca5a2a37661
--- /dev/null
+++ b/paddle/utils/Compiler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+/**
+ * This header defines some useful attribute by each compiler. It is the
+ * abstract layer of compilers.
+ */
+#ifdef __GNUC__
+#define GCC_VERSION \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION
+#endif
+
+/**
+ * __must_check macro. It make the function's return value must be used,
+ * otherwise it will raise a compile warning. And also Paddle treat all compile
+ * warnings as errors.
+ */
+#if GCC_VERSION >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
new file mode 100644
index 00000000000000..cda1b5c37dada8
--- /dev/null
+++ b/paddle/utils/Error.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include "Compiler.h"
+
+namespace paddle {
+
+/**
+ * Error is Paddle error code. It only contain a std::string as error message.
+ *
+ *
+ * There are two styles to return error in Paddle.
+ *
+ * 1. Return Error
+ *    When method return a status, the return must use `__must_check` attribute.
+ *    Example as below.
+ * @code{cpp}
+ * Error __must_check foo();
+ *
+ * Error __must_check bar() {
+ *   // do something.
+ *   Error err = foo();  // invoke other method return status.
+ *   if (err) return err;
+ *   // do something else.
+ *   return Error();
+ * }
+ * @endcode{cpp}
+ *
+ * 2. Return by parameter.
+ *    It is another way to return an error, by using a pointer parameter.
+ *    Example as below.
+ *
+ * @code{cpp}
+ * Error bar();
+ *
+ * int foo(Error* error) {
+ *   // Do something.
+ *   Error err = bar();
+ *   if (err) {
+ *     *error = s;
+ *     return 0;
+ *   }
+ *   // Do something else.
+ *   if (someInternalErrorHappend) {
+ *     *error = Error("Some dimension is too large, %d", dimension);
+ *     return 0;
+ *   }
+ *   // End of method.
+ *   return someValue;
+ * }
+ *
+ * Error foobar() {
+ *   Error err;
+ *   // do something.
+ *   foo(&err);
+ *   if (err) return err;
+ * }
+ * @endcode{cpp}
+ *
+ *
+ * Currently there is a helper method 'check' in status, because Paddle always
+ * use log(FATAL) or CHECK to make program exit before. When we clean all
+ * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
+ */
+class Error {
+public:
+  /**
+   * Construct a no-error value.
+   */
+  Error() {}
+
+  /**
+   * @brief Create an Error use printf syntax.
+   */
+  explicit Error(const char* fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    constexpr size_t kBufferSize = 1024;
+    char buffer[kBufferSize];
+    vsnprintf(buffer, kBufferSize, fmt, ap);
+    this->msg_.reset(new std::string(buffer));
+    va_end(ap);
+  }
+
+  /**
+   * @brief msg will return the error message. If no error, return nullptr.
+   */
+  const char* msg() const {
+    if (msg_) {
+      return msg_->c_str();
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief operator bool, return True if there is something error.
+   */
+  operator bool() const { return !this->isOK(); }
+
+  /**
+   * @brief isOK return True if there is no error.
+   * @return True if no error.
+   */
+  bool isOK() const { return msg_ == nullptr; }
+
+  /**
+   * @brief check this status by glog.
+   * @note It is a temp method used during cleaning Paddle code. It will be
+   *       removed later.
+   */
+  void check() const { CHECK(this->isOK()) << msg(); }
+
+private:
+  std::shared_ptr<std::string> msg_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 59d6cbdc513660..e8f31bc811ac30 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -33,12 +33,15 @@ DEFINE_int32(port, 20134, "Listening port for pserver");
 DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
 DEFINE_int32(ports_num,
              1,
-             "The ports number for parameter send,"
-             " increment based on default port number");
+             "Number of ports for sending dense parameter,"
+             " following ports on parameter server will be visited"
+             " for sending dense parameter: [port, port+ports_num-1]");
 DEFINE_int32(ports_num_for_sparse,
              0,
-             "The ports number for parameter send,"
-             " increment based on default (port + ports_num)");
+             "Number of ports for sending sparse parameter,"
+             " following ports on parameter server will be visited"
+             " for sending sparse parameter:"
+             " [port+ports_num, port+ports_num+ports_num_for_sparse-1]");
 DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
 DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
 DEFINE_int32(trainer_id,
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 411a64aa8d0737..dbab4ec43ca2fa 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -144,20 +144,20 @@ void runInitFunctions() {
 }
 
 void initMain(int argc, char** argv) {
-  initializeLogging(argc, argv);
   installLayerStackTracer();
   std::string line;
   for (int i = 0; i < argc; ++i) {
     line += argv[i];
     line += ' ';
   }
-  LOG(INFO) << "commandline: " << line;
 
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
+  initializeLogging(argc, argv);
+  LOG(INFO) << "commandline: " << line;
   CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
 
   installProfilerSwitch();
@@ -289,6 +289,7 @@ void mkDir(const char* filename) {
 void mkDirRecursively(const char* dir) {
   struct stat sb;
 
+  if (*dir == 0) return;  // empty string
   if (!stat(dir, &sb)) return;
 
   mkDirRecursively(path::dirname(dir).c_str());
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 26fafbd1ab3f29..aa923b35537775 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
 add_simple_unittest(test_SIMDFlags)
+add_simple_unittest(test_Error)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
new file mode 100644
index 00000000000000..fdf326b17a1c8b
--- /dev/null
+++ b/paddle/utils/tests/test_Error.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Error.h"
+
+#include <gtest/gtest.h>
+
+TEST(Error, testAll) {
+  paddle::Error error;
+  ASSERT_FALSE(error);
+  error = paddle::Error("I'm the error");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("I'm the error", error.msg());
+
+  error = paddle::Error("error2");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("error2", error.msg());
+
+  int i = 3;
+  auto error3 = paddle::Error("error%d", i);
+  ASSERT_TRUE(error3);
+  ASSERT_STREQ("error3", error3.msg());
+}
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 3a9d339976fff9..65d5d50277b665 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -255,6 +255,13 @@ message PriorBoxConfig {
   repeated float variance = 4;
 }
 
+message PadConfig {
+  required ImageConfig image_conf = 1;
+  repeated uint32 pad_c = 2;
+  repeated uint32 pad_h = 3;
+  repeated uint32 pad_w = 4;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -271,6 +278,7 @@ message LayerInputConfig {
   optional MaxOutConfig maxout_conf = 11;
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
+  optional PadConfig pad_conf = 14;
 }
 
 message LayerConfig {
@@ -419,14 +427,14 @@ message LayerConfig {
   // bias size
   optional uint32 bias_size = 48 [default = 0];
 
-  // this parameter can be used as a user-defined parameter when necessary, 
+  // this parameter can be used as a user-defined parameter when necessary,
   // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented, 
+  // e.g., when a new layer with a user-defined parameter is implemented,
   // it can be used to pass that parameter, without modifying the proto file.
   // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.  
+  // to string and reinterpreted in the user's own layer implementation.
   optional string user_arg = 49;
-  
+
   // to indicate rectangle image data
   optional uint64 height = 50;
   optional uint64 width = 51;
@@ -467,6 +475,10 @@ message EvaluatorConfig {
   // Used by ChunkEvaluator
   // chunk of these types are not counted
   repeated int32 excluded_chunk_types = 12;
+
+  // Used by ClassificationErrorEvaluator
+  // top # classification error
+  optional int32 top_k = 13 [default = 1];
 }
 
 message LinkConfig {
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
index 3068bba8b10d89..404f9613792653 100644
--- a/proto/ParameterServerConfig.proto
+++ b/proto/ParameterServerConfig.proto
@@ -27,11 +27,14 @@ message ParameterClientConfig {
  * Configuration structure for ParameterServer2.
  */
 message ParameterServerConfig {
-  // The ports number for parameter send,
-  // increment based on default port number
+  // Number of ports for sending dense parameter,
+  // following ports on parameter server will be visited
+  // for sending dense parameter: [port, port+ports_num-1]
   required int32 ports_num = 1 [default = 1];
-  // The ports number for parameter send,
-  // increment based on default (port + ports_num
+  // Number of ports for sending sparse parameter,
+  // following ports on parameter server will be visited
+  // for sending sparse parameter:
+  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
   required int32 ports_num_for_sparse = 2 [default = 0];
   // network device name for pservers
   required string nics = 3 [default = "xgbe0,xgbe1"];
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1cda4762eb2a55..48e0a1993d07f8 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,11 +4,13 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
+file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
              ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES})
+             ${UTILS_PY_FILES}
+             ${V2_PY_FILES})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
@@ -22,6 +24,8 @@ add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
+add_subdirectory(paddle/v2/reader/tests)
+add_subdirectory(paddle/v2/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 674b5ac58b6feb..da937152ee0ce7 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -493,6 +493,7 @@ def __init__(
             block_expand=None,
             maxout=None,
             spp=None,
+            pad=None,
             format=None,
             nnz=None,
             is_static=None,
@@ -829,7 +830,6 @@ def __init__(
             channels,
             size_x,
             size_y=None,
-            img_width=None,
             start=None,
             stride=None,  # 1 by defalut in protobuf
             stride_y=None,
@@ -844,6 +844,12 @@ def __init__(self, pool_type, pyramid_height, channels):
         self.add_keys(locals())
 
 
+@config_class
+class Pad(Cfg):
+    def __init__(self, channels, pad_c, pad_h, pad_w):
+        self.add_keys(locals())
+
+
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -887,11 +893,11 @@ def __init__(self, channels, groups, img_size_x=0, img_size_y=0):
         self.add_keys(locals())
 
 
-def DataBase(async_load_data=False,
-             constant_slots=None,
-             data_ratio=1,
-             is_main_data=True,
-             usage_ratio=None):
+def create_data_config_proto(async_load_data=False,
+                             constant_slots=None,
+                             data_ratio=1,
+                             is_main_data=True,
+                             usage_ratio=None):
     # default: all sub dataproviders are treat as "main data".
     # see proto/DataConfig.proto for is_main_data
     data_config = DataConfig()
@@ -917,7 +923,7 @@ def SimpleData(files=None,
                context_len=None,
                buffer_capacity=None,
                **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'simple'
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -939,7 +945,7 @@ def PyData(files=None,
            constant_slots=None,
            load_thread_num=None,
            **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'py'
     if load_data_module in g_py_module_name_list:
 
@@ -990,7 +996,7 @@ def ProtoData(files=None,
               constant_slots=None,
               load_thread_num=None,
               **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     if type is None:
         data_config.type = 'proto'
     else:
@@ -1029,7 +1035,7 @@ def Data(type,
          buffer_capacity=None,
          **xargs):
 
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = type
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -1102,7 +1108,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1127,10 +1133,10 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
     pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
                                          pool_conf.padding, pool_conf.stride,
-                                         False)
+                                         not ceil_mode)
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
-                                         pool_conf.stride_y, False)
+                                         pool_conf.stride_y, not ceil_mode)
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
@@ -1247,6 +1253,7 @@ def Evaluator(
         dict_file=None,
         result_file=None,
         num_results=None,
+        top_k=None,
         delimited=None,
         excluded_chunk_types=None, ):
     evaluator = g_config.model_config.evaluators.add()
@@ -1274,6 +1281,8 @@ def Evaluator(
         evaluator.result_file = result_file
     if num_results is not None:
         evaluator.num_results = num_results
+    if top_k is not None:
+        evaluator.top_k = top_k
     if delimited is not None:
         evaluator.delimited = delimited
 
@@ -1803,9 +1812,8 @@ class ConvTransLayer(ConvTransLayerBase):
 
 @config_layer('norm')
 class NormLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(NormLayer, self).__init__(
-            name, 'norm', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, **xargs):
+        super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
@@ -1817,23 +1825,22 @@ def __init__(self, name, inputs, device=None):
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(PoolLayer, self).__init__(
-            name, 'pool', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+        super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf)
+                       pool_conf, ceil_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
 
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, device=device)
+            name, 'spp', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             spp_conf = self.config.inputs[input_index].spp_conf
@@ -1842,6 +1849,25 @@ def __init__(self, name, inputs, device=None):
             self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
+@config_layer('pad')
+class PadLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
+        pad = self.inputs[0].pad
+        self.config.inputs[0].pad_conf.pad_c.extend(pad.pad_c)
+        self.config.inputs[0].pad_conf.pad_h.extend(pad.pad_h)
+        self.config.inputs[0].pad_conf.pad_w.extend(pad.pad_w)
+
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].pad_conf.image_conf
+        parse_image(pad, input_layer.name, image_conf)
+        out_ch = pad.channels + pad.pad_c[0] + pad.pad_c[1]
+        out_h = image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
+        out_w = image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
+        self.set_cnn_layer(name, out_h, out_w, out_ch)
+        self.config.size = out_ch * out_h * out_w
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
@@ -1851,7 +1877,6 @@ def __init__(self,
                  inputs,
                  active_type="linear",
                  bias=True,
-                 device=None,
                  use_global_stats=True,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
@@ -1893,7 +1918,6 @@ def __init__(self,
             0,
             active_type=active_type,
             inputs=inputs,
-            device=device,
             **xargs)
 
         if use_global_stats is not None:
@@ -1905,8 +1929,8 @@ def __init__(self,
         image_conf = self.config.inputs[0].image_conf
         parse_image(self.inputs[0].image, input_layer.name, image_conf)
 
-        # Only pass the width and height of input to batch_norm layer 
-        # when either of it is non-zero. 
+        # Only pass the width and height of input to batch_norm layer
+        # when either of it is non-zero.
         if input_layer.width != 0 or input_layer.height != 0:
             self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
                                image_conf.channels, False)
@@ -1927,9 +1951,9 @@ def calc_parameter_size(self, image_conf):
 
 @config_layer('trans')
 class TransLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(TransLayer, self).__init__(
-            name, 'trans', 0, inputs=inputs, device=device)
+            name, 'trans', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'TransLayer must have one and only one input')
@@ -1938,19 +1962,31 @@ def __init__(self, name, inputs, device=None):
 
 @config_layer('resize')
 class ResizeLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
+    def __init__(self, name, size, inputs, **xargs):
         super(ResizeLayer, self).__init__(
-            name, 'resize', size=size, inputs=inputs, device=device)
+            name, 'resize', size=size, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'ResizeLayer must have one and only one input')
 
 
+@config_layer('rotate')
+class RotateLayer(LayerBase):
+    def __init__(self, name, inputs, height, width, device=None):
+        super(RotateLayer, self).__init__(
+            name, 'rotate', 0, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 1,
+            'RotateLayer must have one and only one input')
+        self.set_layer_height_width(height, width)
+        self.set_layer_size(self.get_input_layer(0).size)
+
+
 @config_layer('blockexpand')
 class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, device=device)
+            name, 'blockexpand', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             parse_block_expand(
@@ -2628,7 +2664,7 @@ def __init__(self,
 
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
old mode 100644
new mode 100755
index edca279dcadef4..ef92107a1093d2
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
@@ -15,10 +15,13 @@
 # recurrent_units.py
 # Version 2.0
 #
-# Some recurrent units can be used in recurrent layer group, 
+# Some recurrent units can be used in recurrent layer group,
 #   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units 
-# 
+#     import trainer.recurrent_units
+#
+# The modules in this file are DEPRECATED.
+# If you would like to use lstm/gru
+# please use the functions defined in paddle.trainer_config_helpers.
 
 from paddle.trainer.config_parser import *
 
@@ -26,7 +29,7 @@
 # long short term memory, can be used in recurrent machine
 # *inputs* must be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two LstmRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 def LstmRecurrentUnit(name,
@@ -194,7 +197,7 @@ def LstmRecurrentLayerGroup(name,
 # gated recurrent unit, can be used in recurrent machine
 # *inputs* should be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two GatedRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index 622b4fc25ccff3..ab9a2562dcccb3 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -58,8 +58,8 @@ def define_py_data_source(file_list,
     :param obj: python object name. May be a function name if using
                 PyDataProviderWrapper.
     :type obj: basestring
-    :param args: The best practice is using dict to pass arguments into 
-                 DataProvider, and use :code:`@init_hook_wrapper` to 
+    :param args: The best practice is using dict to pass arguments into
+                 DataProvider, and use :code:`@init_hook_wrapper` to
                  receive arguments.
     :type args: string or picklable object
     :param async: Load Data asynchronously or not.
@@ -98,7 +98,7 @@ def define_py_data_sources(train_list,
     The annotation is almost the same as define_py_data_sources2, except that
     it can specific train_async and data_cls.
 
-    :param data_cls: 
+    :param data_cls:
     :param train_list: Train list name.
     :type train_list: basestring
     :param test_list: Test list name.
@@ -111,8 +111,8 @@ def define_py_data_sources(train_list,
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :param train_async: Is training data load asynchronously or not.
@@ -163,12 +163,12 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     ..  code-block:: python
 
-        define_py_data_sources2(train_list="train.list", 
-                                test_list="test.list", 
+        define_py_data_sources2(train_list="train.list",
+                                test_list="test.list",
                                 module="data_provider"
                                 # if train/test use different configurations,
                                 # obj=["process_train", "process_test"]
-                                obj="process", 
+                                obj="process",
                                 args={"dictionary": dict_name})
 
     The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
@@ -185,8 +185,8 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :return: None
@@ -195,13 +195,13 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     def py_data2(files, load_data_module, load_data_object, load_data_args,
                  **kwargs):
-        data = DataBase()
+        data = create_data_config_proto()
         data.type = 'py2'
         data.files = files
         data.load_data_module = load_data_module
         data.load_data_object = load_data_object
         data.load_data_args = load_data_args
-        data.async_load_data = True
+        data.async_load_data = False
         return data
 
     define_py_data_sources(
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index ad3efcbf369411..2f25579fcdd979 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -52,6 +52,10 @@ def __wrapper__(*args, **kwargs):
                     kwargs[name] = default_factory(func)
             return func(*args, **kwargs)
 
+        if hasattr(func, 'argspec'):
+            __wrapper__.argspec = func.argspec
+        else:
+            __wrapper__.argspec = inspect.getargspec(func)
         return __wrapper__
 
     return __impl__
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index bd247ea9af9d8d..567521ee9dbadb 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -71,6 +71,7 @@ def evaluator_base(
         result_file=None,
         num_results=None,
         delimited=None,
+        top_k=None,
         excluded_chunk_types=None, ):
     """
     Evaluator will evaluate the network status while training/testing.
@@ -104,12 +105,15 @@ def evaluator_base(
     :param weight: An input layer which is a weight for each sample.
                    Each evaluator may calculate differently to use this weight.
     :type weight: LayerOutput.
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
         classification_threshold, float)
     assert positive_label is None or isinstance(positive_label, int)
     assert num_results is None or isinstance(num_results, int)
+    assert top_k is None or isinstance(top_k, int)
 
     if not isinstance(input, list):
         input = [input]
@@ -130,6 +134,8 @@ def evaluator_base(
         dict_file=dict_file,
         result_file=result_file,
         delimited=delimited,
+        num_results=num_results,
+        top_k=top_k,
         excluded_chunk_types=excluded_chunk_types, )
 
 
@@ -139,6 +145,7 @@ def classification_error_evaluator(input,
                                    label,
                                    name=None,
                                    weight=None,
+                                   top_k=None,
                                    threshold=None):
     """
     Classification Error Evaluator. It will print error rate for classification.
@@ -167,6 +174,8 @@ def classification_error_evaluator(input,
                   then means not set weight. The larger weight it is, the more
                   important this sample is.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param threshold: The classification threshold.
     :type threshold: float
     :return: None.
@@ -178,6 +187,7 @@ def classification_error_evaluator(input,
         input=input,
         label=label,
         weight=weight,
+        top_k=top_k,
         classification_threshold=threshold, )
 
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
old mode 100644
new mode 100755
index 9b6e5774bc82dc..b68460b6a3ab62
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -14,6 +14,7 @@
 
 import functools
 import collections
+import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
@@ -37,6 +38,7 @@
     "dotmul_projection",
     "dotmul_operator",
     "repeat_layer",
+    "seq_reshape_layer",
     "table_projection",
     "mixed_layer",
     "data_layer",
@@ -59,6 +61,7 @@
     'img_cmrnorm_layer',
     'addto_layer',
     'concat_layer',
+    'seq_concat_layer',
     'lstm_step_layer',
     'recurrent_group',
     'memory',
@@ -70,6 +73,7 @@
     'interpolation_layer',
     'bilinear_interp_layer',
     'trans_layer',
+    'rotate_layer',
     'sum_to_one_norm_layer',
     'get_output_layer',
     'LayerType',
@@ -108,6 +112,9 @@
     'print_layer',
     'priorbox_layer',
     'spp_layer',
+    'pad_layer',
+    'eos_layer',
+    'layer_support',
 ]
 
 
@@ -122,6 +129,7 @@ class LayerType(object):
     GRUMEMORY = "gated_recurrent"
     SEQUENCE_LAST_INSTANCE = "seqlastins"
     SEQUENCE_FIRST_INSTANCE = "seqfirstins"
+    SEQUENCE_RESHAPE = "seqreshape"
     POOLING_MAX = "max"
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
@@ -142,6 +150,7 @@ class LayerType(object):
 
     CONCAT_LAYER = 'concat'
     CONCAT_PROJ_LAYER = 'concat2'
+    SEQUENCE_CONCAT_LAYER = 'seqconcat'
 
     LSTM_STEP_LAYER = 'lstm_step'
     GRU_STEP_LAYER = 'gru_step'
@@ -153,6 +162,7 @@ class LayerType(object):
     POWER_LAYER = 'power'
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
+    ROTATE_LAYER = 'rotate'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -170,6 +180,7 @@ class LayerType(object):
     BLOCK_EXPAND = "blockexpand"
     MAXOUT = "maxout"
     SPP_LAYER = "spp"
+    PAD_LAYER = "pad"
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -306,6 +317,11 @@ def wrapper(*args, **kwargs):
                     val.check(method.__name__)
             return method(*args, **kwargs)
 
+        if hasattr(method, 'argspec'):
+            wrapper.argspec = method.argspec
+        else:
+            wrapper.argspec = inspect.getargspec(method)
+
         return wrapper
 
     return decorator
@@ -700,6 +716,7 @@ def __exit__(self, *args, **kwargs):
         # update the size which might be computed inside MixedLayer
         # according to the operator's output size
         self.size = ml.config.size
+        self.finalized = True
 
 
 @wrap_name_default("mixed")
@@ -1279,6 +1296,12 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = last_seq(input=layer)
+
     :param agg_level: Aggregated level
     :param name: Layer name.
     :type name: basestring
@@ -1317,6 +1340,12 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = first_seq(input=layer)
+
     :param agg_level: aggregation level
     :param name: Layer name.
     :type name: basestring
@@ -1417,7 +1446,7 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       expand = repeat_layer(layer, 4)
+       expand = repeat_layer(input=layer, num_repeats=4)
 
     :param input: Input layer
     :type input: LayerOutput
@@ -1444,6 +1473,61 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
         parents=[input])
 
 
+@wrap_name_default("seqreshape")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_reshape_layer(input,
+                      reshape_size,
+                      act=None,
+                      name=None,
+                      layer_attr=None,
+                      bias_attr=None):
+    """
+    A layer for reshaping the sequence. Assume the input sequence has T instances,
+    the dimension of each instance is M, and the input reshape_size is N, then the 
+    output sequence has T*M/N instances, the dimension of each instance is N.
+
+    Note that T*M/N must be an integer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       reshape = seq_reshape_layer(input=layer, reshape_size=4)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param reshape_size: the size of reshaped sequence.
+    :type reshape_size: int
+    :param name: Layer name.
+    :type name: basestring
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    Layer(
+        inputs=[input.name],
+        name=name,
+        size=reshape_size,
+        type=LayerType.SEQUENCE_RESHAPE,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        size=reshape_size,
+        layer_type=LayerType.SEQUENCE_RESHAPE,
+        parents=[input])
+
+
 @wrap_name_default()
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
@@ -1640,7 +1724,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 @layer_support()
 def trans_layer(input, name=None, layer_attr=None):
     """
-    A layer for transposition.
+    A layer for transposing a minibatch matrix.
 
     .. math::
        y = x^\mathrm{T}
@@ -1673,7 +1757,53 @@ def trans_layer(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def rotate_layer(input, height, width, name=None, layer_attr=None):
+    """
+    A layer for rotating 90 degrees (clock-wise) for each feature channel,
+    usually used when the input sample is some image or feature map.
+
+    .. math::
+       y(j,i,:) = x(M-i-1,j,:)
+
+    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       rot = rotate_layer(input=layer,
+                          height=100,
+                          width=100)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param height: The height of the sample matrix
+    :type height: int
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    l = Layer(
+        name=name,
+        height=height,
+        width=width,
+        type=LayerType.ROTATE_LAYER,
+        inputs=[input.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.ROTATE_LAYER,
+        parents=[input],
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     """
     Cosine Similarity Layer. The cosine similarity equation is here.
 
@@ -1688,6 +1818,12 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
     Note that the above computation is for one sample. Multiple samples are
     processed in one batch.
 
+    The example usage is:
+
+    .. code-block:: python
+
+       cos = cos_sim(a=layer1, b=layer2, size=3)
+
     :param name: layer name
     :type name: basestring
     :param a: input layer a
@@ -1824,14 +1960,14 @@ def img_conv_layer(input,
                    trans=False,
                    layer_type=None):
     """
-    Convolution layer for image. Paddle can support both square and non-square 
+    Convolution layer for image. Paddle can support both square and non-square
     input currently.
 
     The details of convolution layer, please refer UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/
     FeatureExtractionUsingConvolution/>`_ .
 
-    Convolution Transpose (deconv) layer for image. Paddle can support both square 
+    Convolution Transpose (deconv) layer for image. Paddle can support both square
     and non-square input currently.
 
     The details of convolution transpose layer,
@@ -1849,6 +1985,16 @@ def img_conv_layer(input,
     pieces. First 256/4 = 64 channels will process by first 32 filters. The
     rest channels will be processed by rest group of filters.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                              num_channels=8,
+                              num_filters=16, stride=1,
+                              bias_attr=False,
+                              act=ReluActivation())
+
     :param name: Layer name.
     :type name: basestring
     :param input: Layer Input.
@@ -1890,7 +2036,7 @@ def img_conv_layer(input,
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
     :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt", otherwise layer_type 
+                       layer_type has to be "exconvt", otherwise layer_type
                        has to be either "exconv" or "cudnn_conv"
     :type layer_type: String
     :return: LayerOutput object.
@@ -1979,7 +2125,8 @@ def img_pool_layer(input,
                    layer_attr=None,
                    pool_size_y=None,
                    stride_y=None,
-                   padding_y=None):
+                   padding_y=None,
+                   ceil_mode=True):
     """
     Image pooling Layer.
 
@@ -1987,6 +2134,34 @@ def img_pool_layer(input,
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
+    - ceil_mode=True:
+
+    ..  math::
+
+        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
+        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
+        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        maxpool = img_pool_layer(input=conv,
+                                 pool_size=3,
+                                 pool_size_y=5,
+                                 num_channels=8,
+                                 stride=1,
+                                 stride_y=2,
+                                 padding=1,
+                                 padding_y=2,
+                                 pool_type=MaxPooling())
+
     :param padding: pooling padding width.
     :type padding: int
     :param padding_y: pooling padding height. It's equal to padding by default.
@@ -2010,6 +2185,10 @@ def img_pool_layer(input,
     :type stride_y: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
+                      Defalut is True. If set false, Otherwise use floor.
+
+    :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2047,6 +2226,7 @@ def img_pool_layer(input,
                     stride_y=stride_y,
                     padding_y=padding_y))
         ],
+        ceil_mode=ceil_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2069,6 +2249,15 @@ def spp_layer(input,
     The details please refer to
     `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        spp = spp_layer(input=data, 
+                        pyramid_height=2, 
+                        num_channels=16, 
+                        pool_type=MaxPooling())
+
     :param name: layer name.
     :type name: basestring
     :param input: layer's input.
@@ -2157,6 +2346,12 @@ def img_cmrnorm_layer(input,
     The details please refer to
     `Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = img_cmrnorm_layer(input=net, size=5)
+
     :param name: layer name.
     :type name: None|basestring
     :param input: layer's input.
@@ -2212,6 +2407,12 @@ def batch_norm_layer(input,
     The details of batch normalization please refer to this
     `paper <http://arxiv.org/abs/1502.03167>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = batch_norm_layer(input=net, act=ReluActivation())
+
     :param name: layer name.
     :type name: basestring
     :param input: batch normalization input. Better be linear activation.
@@ -2501,6 +2702,63 @@ def __reduce_concat_type__(a, b):
         size=sz)
 
 
+@wrap_name_default("seqconcat")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
+                     bias_attr=None):
+    """
+    Concat sequence a with sequence b.
+
+    Inputs: 
+      - a = [a1, a2, ..., an]
+      - b = [b1, b2, ..., bn]
+      - Note that the length of a and b should be the same.
+        
+    Output: [a1, b1, a2, b2, ..., an, bn]
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        concat = seq_concat_layer(a=layer1, b=layer2)
+
+    :param name: Layer name.
+    :type name: basestring
+    :param a: input sequence layer
+    :type a: LayerOutput
+    :param b: input sequence layer
+    :type b: LayerOutput
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
+    assert a.size == b.size
+    Layer(
+        name=name,
+        type=LayerType.SEQUENCE_CONCAT_LAYER,
+        inputs=[a.name, b.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+
+    return LayerOutput(
+        name,
+        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
+        parents=[a, b],
+        activation=act,
+        size=a.size)
+
+
 def memory(name,
            size,
            is_seq=False,
@@ -2661,6 +2919,7 @@ def lstm_step_layer(input,
 
 
 @wrap_bias_attr_default()
+@wrap_param_attr_default()
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('gru_step')
@@ -2672,6 +2931,7 @@ def gru_step_layer(input,
                    name=None,
                    gate_act=None,
                    bias_attr=None,
+                   param_attr=None,
                    layer_attr=None):
     """
 
@@ -2683,6 +2943,8 @@ def gru_step_layer(input,
     :param name:
     :param gate_act:
     :param bias_attr:
+    :param param_attr: the parameter_attribute for transforming the output_mem
+                       from previous step.
     :param layer_attr:
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2693,7 +2955,12 @@ def gru_step_layer(input,
     Layer(
         name=name,
         type=LayerType.GRU_STEP_LAYER,
-        inputs=[input.name, output_mem.name],
+        # The parameter here is for transforming the output_mem. The input has
+        # already been transformed outside this module so it does not need
+        # parameter associated with it.
+        # The parameter here is instead grouped with input is due to
+        # backward model compatibility.
+        inputs=[Input(input.name, **param_attr.attr), output_mem.name],
         bias=ParamAttr.to_bias(bias_attr),
         size=size,
         active_type=act.name,
@@ -3342,6 +3609,7 @@ def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
+                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3356,6 +3624,8 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3383,7 +3653,7 @@ def __add_evaluator__(e):
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight)
+        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -3488,9 +3758,6 @@ def conv_projection(input,
                     groups=1,
                     param_attr=None):
     """
-    ConvProjection with a layer as input.
-    It performs element-wise multiplication with weight.
-
     Different from img_conv_layer and conv_op, conv_projection is an Projection,
     which can be used in mixed_layer and conat_layer. It use cudnn to implement
     conv and only support GPU mode.
@@ -3499,7 +3766,7 @@ def conv_projection(input,
 
     .. code-block:: python
 
-       proj = conv_projection(img=input1,
+       proj = conv_projection(input=input1,
                               filter_size=3,
                               num_filters=64,
                               num_channels=64)
@@ -3582,6 +3849,110 @@ def conv_projection(input,
     return proj
 
 
+@wrap_name_default("pad")
+@layer_support()
+def pad_layer(input,
+              pad_c=None,
+              pad_h=None,
+              pad_w=None,
+              name=None,
+              layer_attr=None):
+    """
+    This operation pads zeros to the input data according to pad_c,pad_h
+    and pad_w. pad_c, pad_h, pad_w specifies the which dimension and size
+    of padding. And the input data shape is NCHW.
+
+    For example, pad_c=[2,3] means padding 2 zeros before the
+    input data and 3 zeros after the input data in channel dimension.
+    pad_h means padding zeros in height dimension. pad_w means padding zeros
+    in width dimension.
+
+    For example,
+
+    .. code-block:: python
+
+       input(2,2,2,3)  = [
+                           [ [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]] ],
+                           [ [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]] ]
+                         ]
+
+       pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
+
+       output(2,4,2,3) = [
+                           [ [[0,0,0], [0,0,0]],
+                             [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]],
+                             [[0,0,0], [0,0,0]] ],
+                           [ [[0,0,0], [0,0,0]],
+                             [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]],
+                             [[0,0,0], [0,0,0]] ]
+                         ]
+
+    The simply usage is:
+
+    .. code-block:: python
+
+       pad = pad_layer(input=ipt,
+                       pad_c=[4,4],
+                       pad_h=[0,0],
+                       pad_w=[2,2])
+
+    :param input: layer's input.
+    :type input: LayerOutput
+    :param pad_c: padding size in channel dimension.
+    :type pad_c: list|None
+    :param pad_h: padding size in height dimension.
+    :type pad_h: list|None
+    :param pad_w: padding size in width dimension.
+    :type pad_w: list|None
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param name: layer name.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if pad_c is not None:
+        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
+    else:
+        pad_c = [0, 0]
+
+    if pad_h is not None:
+        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
+    else:
+        pad_h = [0, 0]
+
+    if pad_w is not None:
+        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
+    else:
+        pad_w = [0, 0]
+
+    assert input.num_filters is not None
+    in_ch = input.num_filters
+    out_ch = in_ch + pad_c[0] + pad_c[1]
+
+    l = Layer(
+        name=name,
+        type=LayerType.PAD_LAYER,
+        inputs=Input(
+            input.name,
+            pad=Pad(
+                channels=in_ch,
+                pad_c=pad_c,
+                pad_h=pad_h,
+                pad_w=pad_w, )),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        layer_type=LayerType.PAD_LAYER,
+        parents=[input],
+        num_filters=out_ch,
+        size=l.config.size)
+
+
 @wrap_name_default()
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
@@ -3605,13 +3976,13 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       conv_shift = conv_shift_layer(input=[layer1, layer2])
+       conv_shift = conv_shift_layer(a=layer1, b=layer2)
 
     :param name: layer name
     :type name: basestring
     :param a: Input layer a.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: input layer b.
     :type b: LayerOutput
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3703,8 +4074,8 @@ def tensor_layer(a,
 @wrap_act_default()
 @layer_support()
 def selective_fc_layer(input,
-                       select,
                        size,
+                       select=None,
                        act=None,
                        name=None,
                        pass_generation=False,
@@ -3731,6 +4102,7 @@ def selective_fc_layer(input,
     :type input: LayerOutput|list|tuple
     :param select: The select layer. The output of select layer should be a
                    sparse binary matrix, and treat as the mask of selective fc.
+                   If is None, acts exactly like fc_layer.
     :type select: LayerOutput
     :param size: The layer dimension.
     :type size: int
@@ -3959,7 +4331,7 @@ def block_expand_layer(input,
 
     .. code-block:: python
 
-       block_expand = block_expand_layer(input,
+       block_expand = block_expand_layer(input=layer,
                                          num_channels=128,
                                          stride_x=1,
                                          stride_y=1,
@@ -4013,13 +4385,7 @@ def block_expand_layer(input,
 
 @wrap_name_default()
 @layer_support()
-def maxout_layer(input,
-                 groups,
-                 num_channels=None,
-                 size_x=None,
-                 size_y=None,
-                 name=None,
-                 layer_attr=None):
+def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     """
     A layer to do max out on conv layer output.
       - Input: output of a conv layer.
@@ -4049,12 +4415,6 @@ def maxout_layer(input,
     :type num_channels: int|None
     :param groups: The group number of input layer.
     :type groups: int
-    :param size_x: conv output width. If None will be set
-                   automatically from previous output.
-    :type size_x: int|None
-    :param size_y: conv output height. If None will be set
-                   automatically from previous output.
-    :type size_y: int|None
     :param name: The name of this layer, which can not specify.
     :type name: None|basestring.
     :param layer_attr: Extra Layer attribute.
@@ -4175,7 +4535,7 @@ def warp_ctc_layer(input,
         - You can set 'blank' to any value ranged in [0, num_classes], which
           should be consistent as that used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
-         'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected instead in the 'input' layer.
 
     The simple usage:
 
@@ -4308,6 +4668,13 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
+    The simple usage:
+
+    .. code-block:: python
+
+      crf_decoding = crf_decoding_layer(input=input,
+                                        size=label_dim)
+
     :param input: The first input layer.
     :type input: LayerOutput
     :param size: size of this layer.
@@ -4616,6 +4983,7 @@ def cross_entropy_with_selfnorm(input,
                                 layer_attr=None):
     """
     A loss layer for multi class entropy with selfnorm.
+    Input should be a vector of positive numbers, without normalization.
 
     .. code-block:: python
 
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
old mode 100644
new mode 100755
index 375bea34e8aa0a..cadde11ff81658
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -737,12 +737,12 @@ def lstmemory_group(input,
                     lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version Long Short Term Memory. It
+    lstm_group is a recurrent layer group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states, or hidden states in every time step are accessible to for the
+    cell states, or hidden states in every time step are accessible to the
     user. This is especially useful in attention model. If you do not need to
-    access to the internal states of the lstm, but merely use its outputs,
+    access the internal states of the lstm, but merely use its outputs,
     it is recommended to use the lstmemory, which is relatively faster than
     lstmemory_group.
 
@@ -822,6 +822,7 @@ def gru_unit(input,
              size=None,
              name=None,
              gru_bias_attr=None,
+             gru_param_attr=None,
              act=None,
              gate_act=None,
              gru_layer_attr=None):
@@ -862,6 +863,7 @@ def gru_unit(input,
         output_mem=out_mem,
         size=size,
         bias_attr=gru_bias_attr,
+        param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         layer_attr=gru_layer_attr)
@@ -874,15 +876,16 @@ def gru_group(input,
               name=None,
               reverse=False,
               gru_bias_attr=None,
+              gru_param_attr=None,
               act=None,
               gate_act=None,
               gru_layer_attr=None):
     """
-    gru_group is a recurrent layer group version Gated Recurrent Unit. It
+    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
-    benefit is that gru hidden sates are accessible to for the user. This is
-    especially useful in attention model. If you do not need to access to
-    any internal state, but merely use the outputs of a GRU, it is recommanded
+    benefit is that gru hidden states are accessible to the user. This is
+    especially useful in attention model. If you do not need to access
+    any internal state, but merely use the outputs of a GRU, it is recommended
     to use the grumemory, which is relatively faster.
 
     Please see grumemory in layers.py for more detail about the maths.
@@ -922,6 +925,7 @@ def __gru_step__(ipt):
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
+            gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
             gru_layer_attr=gru_layer_attr)
@@ -942,6 +946,7 @@ def simple_gru(input,
                mixed_bias_param_attr=None,
                mixed_layer_attr=None,
                gru_bias_attr=None,
+               gru_param_attr=None,
                act=None,
                gate_act=None,
                gru_layer_attr=None):
@@ -952,22 +957,22 @@ def simple_gru(input,
     use one complete layer to implement rnn (including simple rnn, gru and lstm)
     with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But,
     the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py. 
+    See details in their interfaces in layers.py.
     The other implementation is to use an recurrent group which can ensemble a
     series of layers to compute rnn step by step. This way is flexible for
     attenion mechanism or other complex connections.
 
     - gru_step_layer: only compute rnn by one step. It needs an memory as input
       and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory. 
+    - gru_unit: a wrapper of gru_step_layer with memory.
     - gru_group: a GRU cell implemented by a combination of multiple layers in
       recurrent group.
-      But :math:`W x_t` is not done in group.  
+      But :math:`W x_t` is not done in group.
     - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group. 
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and 
+      with gru_group and is faster than gru_group.
+    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and
       gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory. 
+      formula in grumemory.
 
     The computational speed is that, grumemory is relatively better than
     gru_group, and gru_group is relatively better than simple_gru.
@@ -1010,6 +1015,7 @@ def simple_gru(input,
         input=m,
         reverse=reverse,
         gru_bias_attr=gru_bias_attr,
+        gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         gru_layer_attr=gru_layer_attr)
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 403aafabe91434..93dd7796c246ae 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,17 +9,10 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-if (PROTOBUF_3)
-  add_paddle_exe(protobuf_equal
-    ProtobufEqualMain.cpp)
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-  )
-else()
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  )
-endif()
+add_paddle_exe(protobuf_equal
+  ProtobufEqualMain.cpp)
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
+)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 3f1d99701afe54..c9178e3c6a46a2 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -3,7 +3,8 @@ export configs=(test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
+test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
+test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
+test_seq_concat_reshape)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
new file mode 100644
index 00000000000000..b6905824f0cb09
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -0,0 +1,295 @@
+type: "recurrent_nn"
+layers {
+  name: "data_a"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "data_b"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_a"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_0___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_0___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_0__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_b"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_1___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_1___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_1__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__last_seq_0__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_0__"
+  }
+  trans_type: "non-seq"
+}
+layers {
+  name: "__last_seq_1__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_1__"
+  }
+  trans_type: "non-seq"
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__last_seq_0__"
+    input_parameter_name: "softmax_param"
+  }
+  inputs {
+    input_layer_name: "__last_seq_1__"
+    input_parameter_name: "softmax_param"
+  }
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__cost_0__"
+  type: "multi-class-cross-entropy"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+parameters {
+  name: "mixed_param"
+  size: 60000
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_param"
+  size: 120000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_bias"
+  size: 600
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 600
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "softmax_param"
+  size: 2000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data_a"
+input_layer_names: "data_b"
+input_layer_names: "label"
+output_layer_names: "__cost_0__"
+evaluators {
+  name: "classification_error_evaluator"
+  type: "classification_error"
+  input_layers: "__fc_layer_0__"
+  input_layers: "label"
+}
+sub_models {
+  name: "root"
+  layer_names: "data_a"
+  layer_names: "data_b"
+  layer_names: "__simple_gru_0___transform"
+  layer_names: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__"
+  layer_names: "__simple_gru_1___transform"
+  layer_names: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__"
+  layer_names: "__last_seq_0__"
+  layer_names: "__last_seq_1__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "label"
+  layer_names: "__cost_0__"
+  input_layer_names: "data_a"
+  input_layer_names: "data_b"
+  input_layer_names: "label"
+  output_layer_names: "__cost_0__"
+  evaluator_names: "classification_error_evaluator"
+  is_recurrent_layer_group: false
+}
+sub_models {
+  name: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_0___transform"
+    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+sub_models {
+  name: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_1___transform"
+    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
index b30bbb2a4e24d7..c1bfdf1b19c61d 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
   inputs {
     input_layer_name: "b"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
   inputs {
     input_layer_name: "c"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__sum_to_one_norm_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index 41d2e2f2671f5c..3e9d28416ed506 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -465,11 +465,11 @@ parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
   size: 30000
   initial_mean: 0.0
-  initial_std: 0.01
+  initial_std: 0.1
   dims: 100
   dims: 300
   initial_strategy: 0
-  initial_smart: false
+  initial_smart: true
 }
 parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
new file mode 100644
index 00000000000000..91284b4fb32fcf
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data1"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data2"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__seqconcat_0__"
+  type: "seqconcat"
+  size: 30
+  active_type: ""
+  inputs {
+    input_layer_name: "data1"
+  }
+  inputs {
+    input_layer_name: "data2"
+  }
+}
+layers {
+  name: "__seqreshape_0__"
+  type: "seqreshape"
+  size: 5
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data1"
+  }
+}
+input_layer_names: "data1"
+input_layer_names: "data2"
+output_layer_names: "__seqconcat_0__"
+output_layer_names: "__seqreshape_0__"
+sub_models {
+  name: "root"
+  layer_names: "data1"
+  layer_names: "data2"
+  layer_names: "__seqconcat_0__"
+  layer_names: "__seqreshape_0__"
+  input_layer_names: "data1"
+  input_layer_names: "data2"
+  output_layer_names: "__seqconcat_0__"
+  output_layer_names: "__seqreshape_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
index 1cfb92255aa92f..569b0b945a762e 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
@@ -19,7 +19,7 @@ model_config {
 data_config {
   type: "py2"
   files: "train.list"
-  async_load_data: true
+  async_load_data: false
   for_test: false
   load_data_module: "a"
   load_data_object: "c"
@@ -58,7 +58,7 @@ opt_config {
 test_data_config {
   type: "py2"
   files: "test.list"
-  async_load_data: true
+  async_load_data: false
   for_test: true
   load_data_module: "b"
   load_data_object: "d"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index a37eb6439e6d28..c8a3b190b19148 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -2,16 +2,18 @@
 cd `dirname $0`
 
 set -e
+PYTHON_EXEC=$1
+COMPARE_PROTO_UTIL=$2
 
 protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh $1
+./generate_protostr.sh ${PYTHON_EXEC}
 
 . ./file_list.sh
 
-if [ -z $1 ]; then
+if [ -z ${COMPARE_PROTO_UTIL} ]; then
   for file in $files
   do
       base_protostr=$protostr/$file
@@ -22,20 +24,20 @@ if [ -z $1 ]; then
 else
   for file in ${configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
 
   for file in ${whole_configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
new file mode 100644
index 00000000000000..c19bb9685aa24c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
@@ -0,0 +1,40 @@
+from paddle.trainer_config_helpers import *
+
+settings(learning_rate=1e-4, batch_size=1000)
+
+data_1 = data_layer(name='data_a', size=100)
+data_2 = data_layer(name='data_b', size=100)
+
+mixed_param = ParamAttr(name='mixed_param')
+
+gru_param = ParamAttr(name='gru_param')
+gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)
+
+gru1 = simple_gru(
+    input=data_1,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+gru2 = simple_gru(
+    input=data_2,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+softmax_param = ParamAttr(name='softmax_param')
+
+predict = fc_layer(
+    input=[last_seq(input=gru1), last_seq(input=gru2)],
+    size=10,
+    param_attr=[softmax_param, softmax_param],
+    bias_attr=False,
+    act=SoftmaxActivation())
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
new file mode 100644
index 00000000000000..bb5f13410dbbba
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -0,0 +1,21 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2304, height=48, width=42)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(
+    input=conv, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
+
+pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+
+outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
new file mode 100644
index 00000000000000..5c161ba805fb30
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
@@ -0,0 +1,12 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din1 = data_layer(name='data1', size=30)
+din2 = data_layer(name='data2', size=30)
+
+opts = []
+opts.append(seq_concat_layer(a=din1, b=din2))
+opts.append(seq_reshape_layer(input=din1, reshape_size=5))
+
+outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
index ae275735aa2b85..e6cd35ee761d1a 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -39,6 +39,7 @@
 assert z1.size > 0
 
 y2 = fc_layer(input=y, size=15)
+z2 = rotate_layer(input=y2, height=5, width=3)
 
 cos1 = cos_sim(a=x1, b=y1)
 cos3 = cos_sim(a=x1, b=y2, size=3)
@@ -46,7 +47,7 @@
 linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
 
 out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1],
+    input=[cos1, cos3, linear_comb, z, z1, z2],
     size=num_classes,
     act=SoftmaxActivation())
 
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
new file mode 100644
index 00000000000000..d548d1adaafacd
--- /dev/null
+++ b/python/paddle/v2/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import optimizer
+import layer
+import activation
+import parameters
+import trainer
+import event
+import data_type
+import topology
+import data_feeder
+from . import dataset
+from . import reader
+import attr
+import pooling
+import inferencer
+import networks
+import py_paddle.swig_paddle as api
+
+__all__ = [
+    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
+    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'topology', 'networks', 'inferencer', 'infer'
+]
+
+
+def init(**kwargs):
+    args = []
+    for key in kwargs.keys():
+        args.append('--%s=%s' % (key, str(kwargs[key])))
+
+    api.initPaddle(*args)
+
+
+infer = inferencer.infer
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
new file mode 100644
index 00000000000000..1f3aab9ef3c5f6
--- /dev/null
+++ b/python/paddle/v2/activation.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers.activations import *
+
+__all__ = [
+    "Base", "Tanh", "Sigmoid", "Softmax", "Identity", "Linear",
+    'SequenceSoftmax', "Exp", "Relu", "BRelu", "SoftRelu", "STanh", "Abs",
+    "Square", "Log"
+]
+
+Base = BaseActivation
+Tanh = TanhActivation
+Sigmoid = SigmoidActivation
+Softmax = SoftmaxActivation
+SequenceSoftmax = SequenceSoftmaxActivation
+Identity = IdentityActivation
+Linear = Identity
+Relu = ReluActivation
+BRelu = BReluActivation
+SoftRelu = SoftReluActivation
+STanh = STanhActivation
+Abs = AbsActivation
+Square = SquareActivation
+Exp = ExpActivation
+Log = LogActivation
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
new file mode 100644
index 00000000000000..40c64f621b443d
--- /dev/null
+++ b/python/paddle/v2/attr.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers.attrs import *
+
+__all__ = [
+    "Param",
+    "Extra",
+]
+
+Param = ParameterAttribute
+Extra = ExtraLayerAttribute
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
new file mode 100644
index 00000000000000..035f96b0f2e978
--- /dev/null
+++ b/python/paddle/v2/config_base.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+import paddle.trainer_config_helpers as conf_helps
+
+
+class Layer(object):
+    def __init__(self, name=None, parent_layers=None):
+        assert isinstance(parent_layers, dict)
+        self.name = name
+        self.__parent_layers__ = parent_layers
+
+    def to_proto(self, context):
+        """
+        function to set proto attribute
+        """
+        kwargs = dict()
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        if self.name is None:
+            return self.to_proto_impl(**kwargs)
+        elif self.name not in context:
+            context[self.name] = self.to_proto_impl(**kwargs)
+
+        return context[self.name]
+
+    def to_proto_impl(self, **kwargs):
+        raise NotImplementedError()
+
+
+def __convert_to_v2__(method_name, parent_names, is_default_name=True):
+    if is_default_name:
+        wrapper = wrap_name_default(name_prefix=method_name)
+    else:
+        wrapper = None
+
+    class V2LayerImpl(Layer):
+        def __init__(self, **kwargs):
+            parent_layers = dict()
+            other_kwargs = dict()
+            for pname in parent_names:
+                if kwargs.has_key(pname):
+                    parent_layers[pname] = kwargs[pname]
+
+            for key in kwargs.keys():
+                if key not in parent_names:
+                    other_kwargs[key] = kwargs[key]
+
+            name = kwargs.get('name', None)
+            super(V2LayerImpl, self).__init__(name, parent_layers)
+            self.__other_kwargs__ = other_kwargs
+
+        if wrapper is not None:
+            __init__ = wrapper(__init__)
+
+        def to_proto_impl(self, **kwargs):
+            args = dict()
+            for each in kwargs:
+                args[each] = kwargs[each]
+            for each in self.__other_kwargs__:
+                args[each] = self.__other_kwargs__[each]
+            return getattr(conf_helps, method_name)(**args)
+
+    return V2LayerImpl
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
new file mode 100644
index 00000000000000..3b106e100cff75
--- /dev/null
+++ b/python/paddle/v2/data_feeder.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from py_paddle import swig_paddle
+from py_paddle import DataProviderConverter
+import data_type
+
+__all__ = ['DataFeeder']
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into a data structure
+    of Arguments which is defined in the API. The paddle.reader usually returns
+    a list of mini-batch data entries. Each data entry in the list is one sample.
+    Each sample is a list or a tuple with one feature or multiple features.
+    DataFeeder converts this mini-batch data entries into Arguments in order
+    to feed it to C++ interface.
+    
+    The example usage:
+    
+        data_types = [('image', paddle.data_type.dense_vector(784)),
+                      ('label', paddle.data_type.integer_value(10))]
+        reader_dict = {'image':0, 'label':1}
+        feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict)
+        minibatch_data = [
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
+                         ]
+        # or minibatch_data = [
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
+        #                     ]
+        arg = feeder(minibatch_data)
+    """
+
+    def __init__(self, data_types, reader_dict):
+        """
+        :param data_types: A list to specify data name and type. Each item is
+                           a tuple of (data_name, data_type). For example:
+                           [('image', paddle.data_type.dense_vector(784)),
+                            ('label', paddle.data_type.integer_value(10))]
+
+        :type data_types: A list of tuple
+        :param reader_dict: A dictionary to specify the position of each data
+                            in the input data.
+        :type reader_dict: dict()
+        """
+        self.input_names = []
+        input_types = []
+        self.reader_dict = reader_dict
+        for each in data_types:
+            self.input_names.append(each[0])
+            assert isinstance(each[1], data_type.InputType)
+            input_types.append(each[1])
+        DataProviderConverter.__init__(self, input_types)
+
+    def convert(self, dat, argument=None):
+        """
+        :param dat: A list of mini-batch data. Each sample is a list or tuple
+                    one feature or multiple features.
+                    for example:
+                    [ 
+                      ([0.2, 0.2], ), # first sample
+                      ([0.8, 0.3], ), # second sample
+                    ]
+                    or,
+                    [ 
+                      [[0.2, 0.2], ], # first sample
+                      [[0.8, 0.3], ], # second sample
+                    ]
+
+        :type dat: List
+        :param argument: An Arguments object contains this mini-batch data with
+                         one or multiple features. The Arguments definition is
+                         in the API.
+        :type argument: swig_paddle.Arguments
+        """
+
+        def reorder_data(data):
+            retv = []
+            for each in data:
+                reorder = []
+                for name in self.input_names:
+                    reorder.append(each[self.reader_dict[name]])
+                retv.append(reorder)
+            return retv
+
+        return DataProviderConverter.convert(self, reorder_data(dat), argument)
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
new file mode 100644
index 00000000000000..522ddfdaacce44
--- /dev/null
+++ b/python/paddle/v2/data_type.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import \
+    InputType, DataType, dense_vector, sparse_binary_vector,\
+    sparse_vector, integer_value, integer_value_sequence
+
+__all__ = [
+    'InputType', 'DataType', 'dense_vector', 'sparse_binary_vector',
+    'sparse_vector', 'integer_value', 'integer_value_sequence'
+]
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 00000000000000..9647e9850332b7
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,3 @@
+import mnist
+
+__all__ = ['mnist']
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 00000000000000..77c54bd268b5d9
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,61 @@
+"""
+CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+"""
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 00000000000000..fcf4437ffaf329
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,41 @@
+import requests
+import hashlib
+import os
+import shutil
+
+__all__ = ['DATA_HOME', 'download', 'md5file']
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum):
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname, url.split('/')[-1])
+    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+        r = requests.get(url, stream=True)
+        with open(filename, 'w') as f:
+            shutil.copyfileobj(r.raw, f)
+
+    return filename
+
+
+def dict_add(a_dict, ele):
+    if ele in a_dict:
+        a_dict[ele] += 1
+    else:
+        a_dict[ele] = 1
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 00000000000000..433e37380f840f
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,120 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+"""
+import paddle.v2.dataset.common
+import tarfile
+import Queue
+import re
+import string
+import threading
+
+__all__ = ['build_dict', 'train', 'test']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+# Read files that match pattern.  Tokenize and yield each file.
+def tokenize(pattern):
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    word_freq = {}
+    for doc in tokenize(pattern):
+        for word in doc:
+            paddle.v2.dataset.common.dict_add(word_freq, word)
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+    UNK = word_idx['<unk>']
+
+    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
+
+    def load(pattern, queue):
+        for doc in tokenize(pattern):
+            queue.put(doc)
+        queue.put(None)
+
+    def reader():
+        # Creates two threads that loads positive and negative samples
+        # into qs.
+        t0 = threading.Thread(
+            target=load, args=(
+                pos_pattern,
+                qs[0], ))
+        t0.daemon = True
+        t0.start()
+
+        t1 = threading.Thread(
+            target=load, args=(
+                neg_pattern,
+                qs[1], ))
+        t1.daemon = True
+        t1.start()
+
+        # Read alternatively from qs[0] and qs[1].
+        i = 0
+        doc = qs[i].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            i += 1
+            doc = qs[i % 2].get()
+
+        # If any queue is empty, reads from the other queue.
+        i += 1
+        doc = qs[i % 2].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            doc = qs[i % 2].get()
+
+    return reader()
+
+
+def train(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+
+
+def test(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 00000000000000..b3791ddad66e58
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,79 @@
+"""
+imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+"""
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train', 'test']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+def word_count(f, word_freq=None):
+    add = paddle.v2.dataset.common.dict_add
+    if word_freq == None:
+        word_freq = {}
+
+    for l in f:
+        for w in l.strip().split():
+            add(word_freq, w)
+        add(word_freq, '<s>')
+        add(word_freq, '<e>')
+
+    return word_freq
+
+
+def build_dict(train_filename, test_filename):
+    with tarfile.open(
+            paddle.v2.dataset.common.download(
+                paddle.v2.dataset.imikolov.URL, 'imikolov',
+                paddle.v2.dataset.imikolov.MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+
+        TYPO_FREQ = 50
+        word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
+
+        dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*dictionary))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+word_idx = {}
+
+
+def reader_creator(filename, n):
+    global word_idx
+    if len(word_idx) == 0:
+        word_idx = build_dict('./simple-examples/data/ptb.train.txt',
+                              './simple-examples/data/ptb.valid.txt')
+
+    def reader():
+        with tarfile.open(
+                paddle.v2.dataset.common.download(
+                    paddle.v2.dataset.imikolov.URL, 'imikolov',
+                    paddle.v2.dataset.imikolov.MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                l = ['<s>'] + l.strip().split() + ['<e>']
+                if len(l) >= n:
+                    l = [word_idx.get(w, UNK) for w in l]
+                    for i in range(n, len(l) + 1):
+                        yield tuple(l[i - n:i])
+
+    return reader
+
+
+def train(n):
+    return reader_creator('./simple-examples/data/ptb.train.txt', n)
+
+
+def test(n):
+    return reader_creator('./simple-examples/data/ptb.valid.txt', n)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 00000000000000..ebcdff78b317ce
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,74 @@
+"""
+MNIST dataset.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # reader could be break.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+
+                images = images / 255.0 * 2.0 - 1.0
+
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            m.terminate()
+            l.terminate()
+
+    return reader
+
+
+def train():
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
new file mode 100644
index 00000000000000..dcffcff2f58c63
--- /dev/null
+++ b/python/paddle/v2/dataset/movielens.py
@@ -0,0 +1,120 @@
+import zipfile
+from common import download
+import re
+import random
+import functools
+
+__all__ = ['train_creator', 'test_creator']
+
+
+class MovieInfo(object):
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self):
+        return [
+            self.index, [CATEGORIES_DICT[c] for c in self.categories],
+            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        ]
+
+
+class UserInfo(object):
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = [1, 18, 25, 35, 45, 50, 56].index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    fn = download(
+        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
+        md5='c4d9eecfca2ab87c1945afe126590906')
+    global MOVIE_INFO
+    if MOVIE_INFO is None:
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        with zipfile.ZipFile(file=fn) as package:
+            for info in package.infolist():
+                assert isinstance(info, zipfile.ZipInfo)
+                MOVIE_INFO = dict()
+                title_word_set = set()
+                categories_set = set()
+                with package.open('ml-1m/movies.dat') as movie_file:
+                    for i, line in enumerate(movie_file):
+                        movie_id, title, categories = line.strip().split('::')
+                        categories = categories.split('|')
+                        for c in categories:
+                            categories_set.add(c)
+                        title = pattern.match(title).group(1)
+                        MOVIE_INFO[int(movie_id)] = MovieInfo(
+                            index=movie_id, categories=categories, title=title)
+                        for w in title.split():
+                            title_word_set.add(w.lower())
+
+                global MOVIE_TITLE_DICT
+                MOVIE_TITLE_DICT = dict()
+                for i, w in enumerate(title_word_set):
+                    MOVIE_TITLE_DICT[w] = i
+
+                global CATEGORIES_DICT
+                CATEGORIES_DICT = dict()
+                for i, c in enumerate(categories_set):
+                    CATEGORIES_DICT[c] = i
+
+                global USER_INFO
+                USER_INFO = dict()
+                with package.open('ml-1m/users.dat') as user_file:
+                    for line in user_file:
+                        uid, gender, age, job, _ = line.strip().split("::")
+                        USER_INFO[int(uid)] = UserInfo(
+                            index=uid, gender=gender, age=age, job_id=job)
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    fn = __initialize_meta_info__()
+    rand = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=fn) as package:
+        with package.open('ml-1m/ratings.dat') as rating:
+            for line in rating:
+                if (rand.random() < test_ratio) == is_test:
+                    uid, mov_id, rating, _ = line.strip().split("::")
+                    uid = int(uid)
+                    mov_id = int(mov_id)
+                    rating = float(rating) * 2 - 5.0
+
+                    mov = MOVIE_INFO[mov_id]
+                    usr = USER_INFO[uid]
+                    yield usr.value() + mov.value() + [[rating]]
+
+
+def __reader_creator__(**kwargs):
+    return lambda: __reader__(**kwargs)
+
+
+train_creator = functools.partial(__reader_creator__, is_test=False)
+test_creator = functools.partial(__reader_creator__, is_test=True)
+
+
+def unittest():
+    for train_count, _ in enumerate(train_creator()()):
+        pass
+    for test_count, _ in enumerate(test_creator()()):
+        pass
+
+    print train_count, test_count
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
new file mode 100644
index 00000000000000..a2af45ecf50846
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/cifar_test.py
@@ -0,0 +1,42 @@
+import paddle.v2.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3072)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_test10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test10())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_train10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train10())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test100())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 99)
+
+    def test_train100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train100())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
new file mode 100644
index 00000000000000..7d8406171b8478
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -0,0 +1,23 @@
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.v2.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
new file mode 100644
index 00000000000000..e887af16634d2d
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -0,0 +1,43 @@
+import paddle.v2.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None
+
+    def test_build_dict(self):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
new file mode 100644
index 00000000000000..9b1748eaaa7f91
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -0,0 +1,20 @@
+import paddle.v2.dataset.imikolov
+import unittest
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(n), n)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(n), n)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 00000000000000..b4408cc2f590d4
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,30 @@
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
new file mode 100644
index 00000000000000..a78bcf076cc65e
--- /dev/null
+++ b/python/paddle/v2/event.py
@@ -0,0 +1,79 @@
+"""
+All training events.
+
+There are:
+
+* BeginIteration
+* EndIteration
+* BeginPass
+* EndPass
+
+TODO(yuyang18): Complete it!
+"""
+import py_paddle.swig_paddle as api
+
+__all__ = [
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+]
+
+
+class WithMetric(object):
+    def __init__(self, evaluator):
+        if not isinstance(evaluator, api.Evaluator):
+            raise TypeError("Evaluator should be api.Evaluator type")
+        self.__evaluator__ = evaluator
+
+    @property
+    def metrics(self):
+        names = self.__evaluator__.getNames()
+        retv = dict()
+        for each_name in names:
+            val = self.__evaluator__.getValue(each_name)
+            retv[each_name] = val
+        return retv
+
+
+class TestResult(WithMetric):
+    def __init__(self, evaluator):
+        super(TestResult, self).__init__(evaluator)
+
+
+class BeginPass(object):
+    """
+    Event On One Pass Training Start.
+    """
+
+    def __init__(self, pass_id):
+        self.pass_id = pass_id
+
+
+class EndPass(WithMetric):
+    """
+    Event On One Pass Training Complete.
+    """
+
+    def __init__(self, pass_id, evaluator):
+        self.pass_id = pass_id
+        WithMetric.__init__(self, evaluator)
+
+
+class BeginIteration(object):
+    """
+    Event On One Batch Training Start.
+    """
+
+    def __init__(self, pass_id, batch_id):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+
+
+class EndIteration(WithMetric):
+    """
+    Event On One Batch Training Complete.
+    """
+
+    def __init__(self, pass_id, batch_id, cost, evaluator):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.cost = cost
+        WithMetric.__init__(self, evaluator)
diff --git a/python/paddle/v2/inferencer.py b/python/paddle/v2/inferencer.py
new file mode 100644
index 00000000000000..ac03b016c9b8bf
--- /dev/null
+++ b/python/paddle/v2/inferencer.py
@@ -0,0 +1,59 @@
+import py_paddle.swig_paddle as api
+
+import topology
+from data_feeder import DataFeeder
+import itertools
+import numpy
+
+__all__ = ['Inference', 'infer']
+
+
+class Inference(object):
+    def __init__(self, output, parameters):
+        topo = topology.Topology(output)
+        gm = api.GradientMachine.createFromConfigProto(
+            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+        for param in gm.getParameters():
+            val = param.getBuf(api.PARAMETER_VALUE)
+            name = param.getName()
+            assert isinstance(val, api.Vector)
+            val.copyFromNumpyArray(parameters.get(name).flatten())
+        self.__gradient_machine__ = gm
+        self.__data_types__ = topo.data_type()
+
+    def iter_infer(self, reader, reader_dict=None):
+        if reader_dict is None:
+            reader_dict = self.default_reader_dict()
+        feeder = DataFeeder(self.__data_types__, reader_dict)
+        self.__gradient_machine__.start()
+        for data_batch in reader():
+            yield self.__gradient_machine__.forwardTest(feeder(data_batch))
+        self.__gradient_machine__.finish()
+
+    def iter_infer_field(self, field, **kwargs):
+        for result in self.iter_infer(**kwargs):
+            yield [each_result[field] for each_result in result]
+
+    def infer(self, field='value', **kwargs):
+        retv = None
+        for result in self.iter_infer_field(field=field, **kwargs):
+            if retv is None:
+                retv = [[]] * len(result)
+            for i, item in enumerate(result):
+                retv[i].append(item)
+        retv = [numpy.concatenate(out) for out in retv]
+        if len(retv) == 1:
+            return retv[0]
+        else:
+            return retv
+
+    def default_reader_dict(self):
+        reader_dict = dict()
+        for i, tp in enumerate(self.__data_types__):
+            reader_dict[tp[0]] = i
+        return reader_dict
+
+
+def infer(output, parameters, reader, reader_dict=None, field='value'):
+    inferer = Inference(output=output, parameters=parameters)
+    return inferer.infer(field=field, reader=reader, reader_dict=reader_dict)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
new file mode 100644
index 00000000000000..67111f1315fbb0
--- /dev/null
+++ b/python/paddle/v2/layer.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Before this new package paddle.v2.layer, users would need to use functions
+in paddle.trainer_config_helpers.layers to configure networks.
+
+The Old Way:
+=========
+This old way requires that the creation of a network be defined in a Python
+function, say network_config, and that this Python function being passed to
+paddle.trainer_config_helpers.parse_network_config for the creation of
+protobuf message description of this network.
+
+```python
+def network_config():
+  img = paddle.trainer_config_helpers.data_layer(name="pixel", size=784)
+  inference = paddle.trainer_config_helpers.fc_layer(
+    input=img,
+    size=10,
+    act=paddle.trainer_config_helpers.SoftmaxActivation())
+  cost = paddle.trainer_config_helpers.classification_cost(
+    input=inference,
+    label=paddle.trainer_config_helpers.data_layer(name="label", size=10))
+
+proto_desc = parse_network_config(network_config)
+```
+
+When parse_network_config executes network_config, those layer definition
+functions like data_layer and fc_layer would change some Python global variables,
+so that after the execution, parse_network_config could collect information from
+these global variables and generates the protobuf message.
+
+
+
+The New Way:
+=========
+In this PR, we define a function in paddle.v2.layer which creates a Python
+class for each layer creation function in paddle.trainer_config_helpers.layers.
+Users can use create a network as follows:
+
+```python
+img = paddle.v2.layer.data(name="pixel", size=784)
+inference = paddle.v2.layer.fc(input=img, size=10, act=paddle.v2.layer.Softmax())
+cost = paddle.v2.layer.classification(
+  input=inference,
+  label=paddle.v2.layer.data(name="label", size=10))
+
+parameters = paddle.v2.parameters.create(cost)
+```
+
+This new way doesn't require those invocations to layer definition functions
+to be in a Python function but could be anywhere.
+
+Also, the creation of a protobuf message is hidden in the invocation of
+paddle.v2.parameters.create, no longer exposed to users.
+"""
+from config_base import Layer, __convert_to_v2__
+import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as __parse__
+
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.default_decorators import wrap_act_default
+from paddle.trainer_config_helpers.default_decorators import \
+    wrap_bias_attr_default
+from paddle.trainer_config_helpers.layers import layer_support
+
+import data_type
+import activation
+
+__all__ = ['parse_network', 'data']
+
+__projection_names__ = filter(lambda x: x.endswith('_projection'),
+                              dir(conf_helps))
+__all__ += __projection_names__
+
+__operator_names__ = filter(lambda x: x.endswith('_operator'), dir(conf_helps))
+__all__ += __operator_names__
+
+
+def parse_network(*outputs):
+    """
+    parse all output layers and then generate a model config proto.
+    :param outputs:
+    :return:
+    """
+
+    def __real_func__():
+        context = dict()
+        real_output = [each.to_proto(context=context) for each in outputs]
+        conf_helps.outputs(real_output)
+
+    return __parse__(__real_func__)
+
+
+"""
+Some layer may need some special config, and can not use __convert_to_v2__ to convert.
+So we also need to implement some special LayerV2.
+"""
+
+
+class DataLayerV2(Layer):
+    def __init__(self, name, type, **kwargs):
+        assert isinstance(type, data_type.InputType)
+
+        self.type = type
+        self.__method_name__ = 'data_layer'
+        self.__kwargs__ = kwargs
+
+        super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        args['size'] = self.type.dim
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__kwargs__:
+            args[each] = self.__kwargs__[each]
+        return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
+
+
+class MixedLayerV2(Layer):
+    """
+    This class is use to support `with` grammar. If not, the following code
+    could convert mixed_layer simply.
+
+        mixed = __convert_to_v2__(
+            'mixed_layer', name_prefix='mixed', parent_names=['input'])
+    """
+
+    class AddToSealedMixedLayerExceptionV2(Exception):
+        pass
+
+    def __init__(self,
+                 size=0,
+                 input=None,
+                 name=None,
+                 act=None,
+                 bias_attr=None,
+                 layer_attr=None):
+        self.__method_name__ = 'mixed_layer'
+        self.finalized = False
+        self.__inputs__ = []
+        if input is not None:
+            self.__inputs__ = input
+
+        other_kwargs = dict()
+        other_kwargs['name'] = name
+        other_kwargs['size'] = size
+        other_kwargs['act'] = act
+        other_kwargs['bias_attr'] = bias_attr
+        other_kwargs['layer_attr'] = layer_attr
+
+        parent_layers = {"input": self.__inputs__}
+        super(MixedLayerV2, self).__init__(name, parent_layers)
+        self.__other_kwargs__ = other_kwargs
+
+    def __iadd__(self, other):
+        if not self.finalized:
+            self.__inputs__.append(other)
+            return self
+        else:
+            raise MixedLayerTypeV2.AddToSealedMixedLayerExceptionV2()
+
+    def __enter__(self):
+        assert len(self.__inputs__) == 0
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.finalized = True
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__other_kwargs__:
+            args[each] = self.__other_kwargs__[each]
+        return getattr(conf_helps, self.__method_name__)(**args)
+
+
+@wrap_name_default("mixed")
+@wrap_act_default(act=activation.Linear())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support(conf_helps.layers.ERROR_CLIPPING, conf_helps.layers.DROPOUT)
+def mixed(size=0,
+          name=None,
+          input=None,
+          act=None,
+          bias_attr=False,
+          layer_attr=None):
+    return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
+
+
+LayerV2 = Layer
+data = DataLayerV2
+AggregateLevel = conf_helps.layers.AggregateLevel
+ExpandLevel = conf_helps.layers.ExpandLevel
+
+
+def __layer_name_mapping__(inname):
+    if inname in ['data_layer', 'memory', 'mixed_layer']:
+        # Do Not handle these layers
+        return
+    elif inname == 'maxid_layer':
+        return 'max_id'
+    elif inname.endswith('memory') or inname.endswith(
+            '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
+        return inname
+    elif inname in [
+            'cross_entropy', 'multi_binary_label_cross_entropy',
+            'cross_entropy_with_selfnorm'
+    ]:
+        return inname + "_cost"
+    elif inname.endswith('_cost'):
+        return inname
+    elif inname.endswith("_layer"):
+        return inname[:-len("_layer")]
+
+
+def __layer_name_mapping_parent_names__(inname):
+    all_args = getattr(conf_helps, inname).argspec.args
+    return filter(
+        lambda x: x in ['input1', 'input2','label', 'input', 'a', 'b', 'expand_as',
+                        'weights', 'vectors', 'weight', 'score', 'left', 'right'],
+        all_args)
+
+
+def __convert_layer__(_new_name_, _old_name_, _parent_names_):
+    global __all__
+    __all__.append(_new_name_)
+    globals()[new_name] = __convert_to_v2__(_old_name_, _parent_names_)
+
+
+for each_layer_name in dir(conf_helps):
+    new_name = __layer_name_mapping__(each_layer_name)
+    if new_name is not None:
+        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
+        assert len(parent_names) != 0, each_layer_name
+        __convert_layer__(new_name, each_layer_name, parent_names)
+
+del parent_names
+del new_name
+del each_layer_name
+
+# convert projection
+for prj in __projection_names__:
+    globals()[prj] = __convert_to_v2__(
+        prj, parent_names=['input'], is_default_name=False)
+
+# convert operator
+operator_list = [
+    # [V1_method_name, parent_names],
+    ['dotmul_operator', ['a', 'b']],
+    ['conv_operator', ['img', 'filter']]
+]
+for op in operator_list:
+    globals()[op[0]] = __convert_to_v2__(
+        op[0], parent_names=op[1], is_default_name=False)
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
new file mode 100644
index 00000000000000..74d91593d8551b
--- /dev/null
+++ b/python/paddle/v2/networks.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.networks as conf_nw
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def __initialize__():
+    for each_subnetwork in conf_nw.__all__:
+        if each_subnetwork in ['inputs', 'outputs']:
+            continue
+        func = getattr(conf_nw, each_subnetwork)
+        if hasattr(func, 'argspec'):
+            argspec = func.argspec
+        else:
+            argspec = inspect.getargspec(func)
+        if each_subnetwork == 'simple_attention':
+            parents = ['encoded_sequence', 'encoded_proj', 'decoder_state']
+        else:
+            parents = filter(lambda x: x.startswith('input'), argspec.args)
+        assert len(parents) != 0, each_subnetwork
+        v2_subnet = __convert_to_v2__(
+            each_subnetwork,
+            parent_names=parents,
+            is_default_name='name' in argspec.args)
+        globals()[each_subnetwork] = v2_subnet
+        global __all__
+        __all__.append(each_subnetwork)
+
+
+__initialize__()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
new file mode 100644
index 00000000000000..10e255dc945efb
--- /dev/null
+++ b/python/paddle/v2/optimizer.py
@@ -0,0 +1,107 @@
+import py_paddle.swig_paddle as swig_api
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
+import paddle.v2
+
+__all__ = [
+    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
+    'RMSProp', 'ModelAverage', 'L2Regularization'
+]
+
+
+class Optimizer(object):
+    def __init__(self, **kwargs):
+        if 'batch_size' in kwargs:
+            del kwargs['batch_size']  # not important for python library.
+
+        def __impl__():
+            v1_optimizers.settings(batch_size=1, **kwargs)
+
+        self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
+            __impl__)
+        self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
+            self.__opt_conf_proto__)
+
+    def enable_types(self):
+        """
+        get enable_types for each optimizer.
+        enable_types = [value, gradient, momentum, etc]
+        For each optimizer(SGD, Adam), GradientMachine should enable different
+        buffers.
+        """
+        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
+        assert isinstance(tmp, swig_api.ParameterOptimizer)
+        return tmp.getParameterTypes()
+
+    def create_local_updater(self):
+        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
+
+    def create_remote_updater(self, pass_num):
+        return swig_api.ParameterUpdater.createRemoteUpdater(self.__opt_conf__,
+                                                             pass_num)
+
+
+class Momentum(Optimizer):
+    def __init__(self, momentum=None, sparse=False, **kwargs):
+        learning_method = v1_optimizers.MomentumOptimizer(
+            momentum=None, sparse=False)
+        super(Momentum, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class Adam(Optimizer):
+    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
+        learning_method = v1_optimizers.AdamOptimizer(
+            beta1=beta1, beta2=beta2, epsilon=epsilon)
+        super(Adam, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class Adamax(Optimizer):
+    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
+        learning_method = v1_optimizers.AdamaxOptimizer(
+            beta1=beta1, beta2=beta2)
+        super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class AdaGrad(Optimizer):
+    def __init__(self, **kwargs):
+        learning_method = v1_optimizers.AdaGradOptimizer()
+        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class DecayedAdaGrad(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(DecayedAdaGrad, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class AdaDelta(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.AdaDeltaOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(AdaDelta, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class RMSProp(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
+        learning_method = v1_optimizers.RMSPropOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
+
+
+ModelAverage = v1_optimizers.ModelAverage
+L2Regularization = v1_optimizers.L2Regularization
+
+if __name__ == '__main__':
+    swig_api.initPaddle('--use_gpu=false')
+    for opt in [
+            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
+            AdaDelta(), RMSProp(), Adam(
+                model_average=ModelAverage(average_window=0.5),
+                regularization=L2Regularization(rate=0.5),
+                gradient_clipping_threshold=25)
+    ]:
+        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
new file mode 100644
index 00000000000000..2a6026bcab1c8a
--- /dev/null
+++ b/python/paddle/v2/parameters.py
@@ -0,0 +1,261 @@
+import numpy as np
+import py_paddle.swig_paddle as api
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+
+from topology import Topology
+
+__all__ = ['Parameters', 'create']
+
+
+def create(layers):
+    """
+    Create parameter pool by topology.
+    :param layers:
+    :return:
+    """
+    topology = Topology(layers)
+    pool = Parameters()
+    for param in topology.proto().parameters:
+        pool.__append_config__(param)
+    return pool
+
+
+class Parameters(object):
+    """
+    Parameters is a dictionary contains Paddle's parameter. The key of
+    Parameters is the name of parameter. The value of Parameters is a plain
+    :code:`numpy.ndarry` .
+
+    Basically usage is
+
+    ..  code-block:: python
+
+        data = paddle.layers.data(...)
+        ...
+        out = paddle.layers.fc(...)
+
+        parameters = paddle.parameters.create(out)
+
+        parameter_names = parameters.names()
+        fc_mat = parameters.get('fc')
+        print fc_mat
+    """
+
+    def __init__(self):
+        self.__param_conf__ = dict()
+        self.__gradient_machines__ = []
+        self.__tmp_params__ = []
+
+    def __append_config__(self, param_conf):
+        """
+        Append a parameter configuration. It used to initialize Parameters and
+        should be invoked only in paddle.parameters.create
+
+        :param param_conf: The parameter configuration in protobuf
+        :type param_conf: ParameterConfig
+        :return: Nothing
+        """
+
+        if not isinstance(param_conf, ParameterConfig):
+            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
+
+        if param_conf.name in self.__param_conf__:
+            raise ValueError("duplicated parameter %s" % param_conf.name)
+
+        self.__param_conf__[param_conf.name] = param_conf
+
+    def keys(self):
+        """
+        keys are the names of each parameter.
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.__param_conf__.keys()
+
+    def names(self):
+        """
+        names of each parameter.
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.keys()
+
+    def has_key(self, key):
+        """
+        has_key return true if there are such parameter name == key
+        :param key: Parameter name
+        :type key: basestring
+        :return: True if contains such key
+        """
+        return key in self.__param_conf__.keys()
+
+    def __iter__(self):
+        """
+        Return an iterator of parameter name. It is used by `for loop`
+        or `in` operator.
+
+        ..  code-block:: python
+
+            parameters = paddle.parameters.create(...)
+            if "fc_param" in parameters:
+                print 'OK'
+        :return: an iterator of parameter name
+        :rtype: iterator
+        """
+        return iter(self.__param_conf__)
+
+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        shape = self.get_shape(key)
+
+        if len(self.__gradient_machines__) == 0:
+            # create new parameter in python numpy.
+            return np.ndarray(shape=shape, dtype=np.float32)
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                param = __get_parameter_in_gradient_machine__(
+                    each_gradient_machine, key)
+                # for simplify implementation now, we always copy from C++
+                assert isinstance(param, api.Parameter)
+                val = param.getBuf(api.PARAMETER_VALUE)
+                assert isinstance(val, api.Vector)
+                val = val.copyToNumpyArray()
+                return val
+                # else continue
+
+            raise RuntimeError("Unexpected branch")
+
+    def get_shape(self, key):
+        """
+        get shape of the parameter.
+        :param key: parameter name
+        :type key: basestring
+        :return: parameter's shape
+        :rtype: tuple
+        """
+        if not isinstance(key, basestring):
+            raise ValueError("parameter name should be string")
+        if not self.has_key(key):
+            raise ValueError("No such parameter %s" % key)
+        conf = self.__param_conf__[key]
+        return tuple(map(int, conf.dims))
+
+    def __setitem__(self, key, value):
+        """
+        Set parameter by parameter name & value. It use Python dict syntax.
+
+        :note: It will always copy the parameter to C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :param value: Parameter matrix.
+        :type value: np.ndarray
+        :return: Nothing
+        """
+
+        if not isinstance(value, np.ndarray):
+            raise ValueError("Must return ndarray")
+        value = value.astype(dtype=np.float32)
+        shape = self.get_shape(key)
+        if value.shape != shape:
+            raise ValueError("Value shape mismatch, expect %s, should %s" %
+                             (shape, value.shape))
+
+        if len(self.__gradient_machines__) == 0:
+            self.__tmp_params__.append((key, value))
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                __copy_parameter_to_gradient_machine__(each_gradient_machine,
+                                                       key, value)
+
+    def get(self, parameter_name):
+        """
+        Get parameter by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :return: The parameter matrix.
+        :rtype: np.ndarray
+        """
+        return self.__getitem__(key=parameter_name)
+
+    def set(self, parameter_name, value):
+        """
+        Set parameter by parameter name & matrix.
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :param value: parameter matrix
+        :type value: np.ndarray
+        :return: Nothing.
+        """
+        self.__setitem__(key=parameter_name, value=value)
+
+    def append_gradient_machine(self, gradient_machine):
+        """
+        append gradient machine to parameters. This method is used internally in
+        Trainer.train.
+
+        :param gradient_machine: Paddle C++ GradientMachine object.
+        :type gradient_machine: api.GradientMachine
+        :return:
+        """
+
+        if not isinstance(gradient_machine, api.GradientMachine):
+            raise ValueError("gradient_machine should be api.GradientMachine")
+
+        if len(self.__tmp_params__) != 0:
+            for name, val in self.__tmp_params__:
+                try:
+                    __copy_parameter_to_gradient_machine__(gradient_machine,
+                                                           name, val)
+                except ValueError:
+                    # If no such parameter in gradient machine, then don't copy
+                    pass
+
+        self.__gradient_machines__.append(gradient_machine)
+
+
+def __get_parameter_in_gradient_machine__(gradient_machine, name):
+    """
+
+    :param gradient_machine:
+    :type gradient_machine: api.GradientMachine
+    :param name:
+    :return:
+    :rtype: api.Parameter
+    """
+    params = filter(lambda p: p.getName() == name,
+                    gradient_machine.getParameters())
+
+    if len(params) == 0:
+        raise ValueError("No such parameter")
+    elif len(params) > 1:
+        raise ValueError("Unexpected branch")
+    else:
+        return params[0]
+
+
+def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
+    """
+    Copy a python ndarray into the gradient machine.
+
+    :param gradient_machine:
+    :type gradient_machine: api.GradientMachine
+    :param name:
+    :param arr:
+    :type arr: np.ndarray
+    :return:
+    :rtype: api.Parameter
+    """
+    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
+    vec = param.getBuf(api.PARAMETER_VALUE)
+    assert isinstance(vec, api.Vector)
+    vec.copyFromNumpyArray(arr.flatten())
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
new file mode 100644
index 00000000000000..9076a159bb4f2c
--- /dev/null
+++ b/python/paddle/v2/pooling.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers.poolings import *
+
+__all__ = ["Max", "CudnnMax", "Avg", "CudnnAvg", "Sum", "SquareRootN"]
+
+Max = MaxPooling
+CudnnMax = CudnnMaxPooling
+Avg = AvgPooling
+CudnnAvg = CudnnAvgPooling
+Sum = SumPooling
+SquareRootN = SquareRootNPooling
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
new file mode 100644
index 00000000000000..7373dc461b1d31
--- /dev/null
+++ b/python/paddle/v2/reader/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# It would be too lengthy to require our users to prefix decorators with `decorator`.
+# For example, we want the following line
+#
+#     r = paddle.reader.decorator.bufferd(paddle.reader.creator.text("hello.txt"))
+#
+# to be a shorter version:
+#
+#     r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt"))
+from decorator import *
+
+import creator
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 00000000000000..5a91bb0b8ef6d1
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['np_array', 'text_file']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector. Or rows of x, if it is a numpy matrix.
+    Or any sub-hyperplane indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\n') of each line will be removed.
+
+    :path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 00000000000000..b7657e27764f09
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'batched', 'firstn'
+]
+
+import itertools
+import random
+from Queue import Queue
+from threading import Thread
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs return value of function using
+    output of each data readers as arguments.
+
+    :param func: function to use.
+    :param *readers: readers whose outputs will be used as arguments of func.
+    :returns: the created data reader.
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is suffled.
+
+    Output from the iterator that created by original reader will be
+    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
+    is determined by argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :param buf_size: shuffle buffer size.
+
+    :returns:the new reader whose output is shuffled.
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of input data
+    readers chained together.
+
+    If input readers output following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    The chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :returns: the new data reader.
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of input readers.
+
+    If input readers output following data entries:
+    (1, 2)    3    (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :*readers: readers that will be composed together.
+    :check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+
+    :returns: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will be not be present if compose is aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+    
+    :param reader: the data reader to read from.
+    :param size: max buffer size.
+    
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def batched(reader, batch_size):
+    """
+    Create a batched reader.
+    :param reader: the data reader to read from.
+    :param batch_size: batch_size
+    :return: the batched reader.
+    """
+
+    def batched_reader():
+        r = reader()
+        batch = []
+        for instance in r:
+            batch.append(instance)
+            if len(batch) == batch_size:
+                yield batch
+                batch = []
+        if batch:
+            yield batch
+
+    return batched_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that reader could return.
+    """
+
+    # TODO(yuyang18): Check if just drop the reader, could clean the opened
+    # resource or not?
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 00000000000000..a51f700406b48f
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_test(NAME reader_tests
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/reader/tests/run_tests.sh
+  ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 00000000000000..9f8d7133b8694a
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,40 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+
+import numpy as np
+
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 00000000000000..734154b9790a4d
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,125 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this invocation helps testing paddle.reader.buffer
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # read have 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compse(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/run_tests.sh b/python/paddle/v2/reader/tests/run_tests.sh
new file mode 100755
index 00000000000000..a544a5636021bc
--- /dev/null
+++ b/python/paddle/v2/reader/tests/run_tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+$1 -m pip install ../../../../../paddle/dist/*.whl
+
+test_list="creator_test.py decorator_test.py"
+
+export PYTHONPATH=$PWD/../../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 00000000000000..a2a8d47d43868d
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
new file mode 100644
index 00000000000000..46b5d08b8761ea
--- /dev/null
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_test(NAME test_v2_layer
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
+add_test(NAME test_v2_api
+        COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
+
+add_test(NAME topology_test
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_topology.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh
new file mode 100755
index 00000000000000..b96f54fe9cc78a
--- /dev/null
+++ b/python/paddle/v2/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+
+$1 -m pip install ../../../../paddle/dist/*.whl
+
+test_list="test_data_feeder.py"
+
+export PYTHONPATH=$PWD/../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
new file mode 100644
index 00000000000000..ab2bc5df76cd83
--- /dev/null
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+
+from paddle.v2 import data_type
+from paddle.v2.data_feeder import DataFeeder
+
+
+class DataFeederTest(unittest.TestCase):
+    def dense_reader(self, size):
+        data = np.random.random(size)
+        return data
+
+    def sparse_binary_reader(self, high, size_limit, non_empty=False):
+        num = np.random.randint(size_limit)  # num could be 0
+        while non_empty and num == 0:
+            num = np.random.randint(size_limit)
+        return np.random.randint(high, size=num).tolist()
+
+    def test_dense(self):
+        def compare(input):
+            feeder = DataFeeder([('image', data_type.dense_vector(784))],
+                                {'image': 0})
+            arg = feeder(input)
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            input = np.array(input, dtype='float32')
+            self.assertAlmostEqual(input.all(), output.all())
+
+        # test numpy array
+        batch_size = 32
+        dim = 784
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim))
+            data.append(each_sample)
+        compare(data)
+
+        # each feature is a list
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim).tolist())
+            data.append(each_sample)
+        compare(data)
+
+        # test tuple
+        data = []
+        for i in xrange(batch_size):
+            each_sample = (self.dense_reader(dim).tolist(), )
+            data.append(each_sample)
+        compare(data)
+
+    def test_sparse_binary(self):
+        dim = 10000
+        batch_size = 32
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.sparse_binary_reader(dim, 50))
+            data.append(each_sample)
+        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), data[i][0])
+
+    def test_sparse(self):
+        dim = 10000
+        batch_size = 32
+        v = []
+        w = []
+        data = []
+        for dat in xrange(batch_size):
+            each_sample = []
+            a = self.sparse_binary_reader(dim, 40, non_empty=True)
+            b = self.dense_reader(len(a)).tolist()
+            v.append(a)
+            w.append(np.array(b, dtype="float32"))
+            each_sample.append(zip(a, b))
+            data.append(each_sample)
+
+        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), v[i])
+            cols_value = output.getSparseRowColsVal(i)
+            value = [val[1] for val in cols_value]
+            value = np.array(value, dtype="float32")
+            self.assertAlmostEqual(value.all(), w[i].all())
+
+    def test_integer(self):
+        dim = 100
+        batch_size = 32
+        index = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(dim))
+            index.append(each_sample)
+        feeder = DataFeeder([('input', data_type.integer_value(dim))],
+                            {'input': 0})
+        arg = feeder(index)
+        output = arg.getSlotIds(0).copyToNumpyArray()
+        index = np.array(index, dtype='int')
+        self.assertEqual(output.all(), index.flatten().all())
+
+    def test_integer_sequence(self):
+        dim = 10000
+        batch_size = 32
+        start = [0]
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(
+                self.sparse_binary_reader(
+                    dim, 30, non_empty=True))
+            data.append(each_sample)
+            start.append(len(each_sample[0]) + start[-1])
+        feeder = DataFeeder([('input', data_type.integer_value_sequence(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output_data = arg.getSlotIds(0).copyToNumpyArray()
+        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
+
+        index = []
+        for dat in data:
+            index.extend(x for x in dat[0])  # only one feature, so dat[0]
+        index = np.array(index, dtype='int')
+        start = np.array(start, dtype='int')
+        self.assertEqual(output_data.all(), index.all())
+        self.assertEqual(output_start.all(), start.all())
+
+    def test_multiple_features(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(10))
+            each_sample.append(
+                self.sparse_binary_reader(
+                    20000, 40, non_empty=True))
+            each_sample.append(self.dense_reader(100))
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_sparse = arg.getSlotValue(1)
+        output_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, but only use 2 features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 featreus, one is duplicate data
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10)),
+                      ('fea3', data_type.dense_vector(100))]
+        feeder = DataFeeder(data_types,
+                            {'fea0': 2,
+                             'fea1': 1,
+                             'fea2': 0,
+                             'fea3': 2})
+        arg = feeder(data)
+        fea0 = arg.getSlotValue(0).copyToNumpyMat()
+        fea1 = arg.getSlotValue(1)
+        fea2 = arg.getSlotIds(2).copyToNumpyArray()
+        fea3 = arg.getSlotValue(3).copyToNumpyMat()
+        for i in xrange(batch_size):
+            self.assertEqual(fea0[i].all(), data[i][2].all())
+            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
+            self.assertEqual(fea2[i], data[i][0])
+            self.assertEqual(fea3[i].all(), data[i][2].all())
+
+    def test_multiple_features_tuple(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            a = np.random.randint(10)
+            b = self.sparse_binary_reader(20000, 40, non_empty=True)
+            c = self.dense_reader(100)
+            each_sample = (a, b, c)
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        out_dense = arg.getSlotValue(0).copyToNumpyMat()
+        out_sparse = arg.getSlotValue(1)
+        out_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(out_dense[i].all(), data[i][2].all())
+            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(out_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    suite = unittest.TestLoader().loadTestsFromTestCase(DataFeederTest)
+    unittest.TextTestRunner().run(suite)
+    if api.isGpuVersion():
+        api.setUseGpu(True)
+        unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
new file mode 100644
index 00000000000000..0055679a91801a
--- /dev/null
+++ b/python/paddle/v2/tests/test_layer.py
@@ -0,0 +1,264 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.activation as activation
+import paddle.v2.attr as attr
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.pooling as pooling
+import paddle.v2.networks as networks
+
+pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
+label = layer.data(name='label', type=data_type.integer_value(10))
+weight = layer.data(name='weight', type=data_type.dense_vector(10))
+score = layer.data(name='score', type=data_type.dense_vector(1))
+
+hidden = layer.fc(input=pixel,
+                  size=100,
+                  act=activation.Sigmoid(),
+                  param_attr=attr.Param(name='hidden'))
+inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
+conv = layer.img_conv(
+    input=pixel,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    act=activation.Linear())
+
+
+class ImageLayerTest(unittest.TestCase):
+    def test_conv_layer(self):
+        conv_shift = layer.conv_shift(a=pixel, b=score)
+        print layer.parse_network(conv, conv_shift)
+
+    def test_pooling_layer(self):
+        maxpool = layer.img_pool(
+            input=conv,
+            pool_size=2,
+            num_channels=16,
+            padding=1,
+            pool_type=pooling.Max())
+        spp = layer.spp(input=conv,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=pooling.Max())
+        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
+        print layer.parse_network(maxpool, spp, maxout)
+
+    def test_norm_layer(self):
+        norm1 = layer.img_cmrnorm(input=conv, size=5)
+        norm2 = layer.batch_norm(input=conv)
+        norm3 = layer.sum_to_one_norm(input=conv)
+        print layer.parse_network(norm1, norm2, norm3)
+
+
+class AggregateLayerTest(unittest.TestCase):
+    def test_aggregate_layer(self):
+        pool = layer.pooling(
+            input=pixel,
+            pooling_type=pooling.Avg(),
+            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+        last_seq = layer.last_seq(input=pixel)
+        first_seq = layer.first_seq(input=pixel)
+        concat = layer.concat(input=[last_seq, first_seq])
+        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
+        print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
+
+
+class MathLayerTest(unittest.TestCase):
+    def test_math_layer(self):
+        addto = layer.addto(input=[pixel, pixel])
+        linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10)
+        interpolation = layer.interpolation(
+            input=[hidden, hidden], weight=score)
+        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
+        power = layer.power(input=pixel, weight=score)
+        scaling = layer.scaling(input=pixel, weight=score)
+        slope = layer.slope_intercept(input=pixel)
+        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
+        cos_sim = layer.cos_sim(a=pixel, b=pixel)
+        trans = layer.trans(input=tensor)
+        print layer.parse_network(addto, linear_comb, interpolation, power,
+                                  scaling, slope, tensor, cos_sim, trans)
+
+
+class ReshapeLayerTest(unittest.TestCase):
+    def test_reshape_layer(self):
+        block_expand = layer.block_expand(
+            input=conv, num_channels=4, stride_x=1, block_x=1)
+        expand = layer.expand(
+            input=weight,
+            expand_as=pixel,
+            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+        repeat = layer.repeat(input=pixel, num_repeats=4)
+        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
+        rotate = layer.rotate(input=pixel, height=16, width=49)
+        print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
+
+
+class RecurrentLayerTest(unittest.TestCase):
+    def test_recurrent_layer(self):
+        word = layer.data(name='word', type=data_type.integer_value(12))
+        recurrent = layer.recurrent(input=word)
+        lstm = layer.lstmemory(input=word)
+        gru = layer.grumemory(input=word)
+        print layer.parse_network(recurrent, lstm, gru)
+
+
+class CostLayerTest(unittest.TestCase):
+    def test_cost_layer(self):
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.classification_cost(
+            input=inference, label=label, weight=weight)
+        cost3 = layer.cross_entropy_cost(input=inference, label=label)
+        cost4 = layer.cross_entropy_with_selfnorm_cost(
+            input=inference, label=label)
+        cost5 = layer.regression_cost(input=inference, label=label)
+        cost6 = layer.regression_cost(
+            input=inference, label=label, weight=weight)
+        cost7 = layer.multi_binary_label_cross_entropy_cost(
+            input=inference, label=label)
+        cost8 = layer.rank_cost(left=score, right=score, label=score)
+        cost9 = layer.lambda_cost(input=inference, score=score)
+        cost10 = layer.sum_cost(input=inference)
+        cost11 = layer.huber_cost(input=score, label=label)
+
+        print layer.parse_network(cost1, cost2)
+        print layer.parse_network(cost3, cost4)
+        print layer.parse_network(cost5, cost6)
+        print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
+
+        crf = layer.crf(input=inference, label=label)
+        crf_decoding = layer.crf_decoding(input=inference, size=3)
+        ctc = layer.ctc(input=inference, label=label)
+        warp_ctc = layer.warp_ctc(input=pixel, label=label)
+        nce = layer.nce(input=inference, label=label, num_classes=3)
+        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
+
+        print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
+                                  hsigmoid)
+
+
+class OtherLayerTest(unittest.TestCase):
+    def test_sampling_layer(self):
+        maxid = layer.max_id(input=inference)
+        sampling_id = layer.sampling_id(input=inference)
+        eos = layer.eos(input=maxid, eos_id=5)
+        print layer.parse_network(maxid, sampling_id, eos)
+
+    def test_slicing_joining_layer(self):
+        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+        print layer.parse_network(pad)
+
+
+class ProjOpTest(unittest.TestCase):
+    def test_projection(self):
+        input = layer.data(name='data', type=data_type.dense_vector(784))
+        word = layer.data(
+            name='word', type=data_type.integer_value_sequence(10000))
+        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
+        mixed0 = layer.mixed(
+            size=256,
+            input=[
+                layer.full_matrix_projection(input=fc0),
+                layer.full_matrix_projection(input=fc1)
+            ])
+        with layer.mixed(size=200) as mixed1:
+            mixed1 += layer.full_matrix_projection(input=fc0)
+            mixed1 += layer.identity_projection(input=fc1)
+
+        table = layer.table_projection(input=word)
+        emb0 = layer.mixed(size=512, input=table)
+        with layer.mixed(size=512) as emb1:
+            emb1 += table
+
+        scale = layer.scaling_projection(input=fc0)
+        scale0 = layer.mixed(size=100, input=scale)
+        with layer.mixed(size=100) as scale1:
+            scale1 += scale
+
+        dotmul = layer.dotmul_projection(input=fc0)
+        dotmul0 = layer.mixed(size=100, input=dotmul)
+        with layer.mixed(size=100) as dotmul1:
+            dotmul1 += dotmul
+
+        context = layer.context_projection(input=fc0, context_len=5)
+        context0 = layer.mixed(size=100, input=context)
+        with layer.mixed(size=100) as context1:
+            context1 += context
+
+        conv = layer.conv_projection(
+            input=input,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv, bias_attr=True)
+        with layer.mixed(bias_attr=True) as conv1:
+            conv1 += conv
+
+        print layer.parse_network(mixed0)
+        print layer.parse_network(mixed1)
+        print layer.parse_network(emb0)
+        print layer.parse_network(emb1)
+        print layer.parse_network(scale0)
+        print layer.parse_network(scale1)
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+    def test_operator(self):
+        ipt0 = layer.data(name='data', type=data_type.dense_vector(784))
+        ipt1 = layer.data(name='word', type=data_type.dense_vector(128))
+        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+
+        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
+        dotmul0 = layer.mixed(input=dotmul_op)
+        with layer.mixed() as dotmul1:
+            dotmul1 += dotmul_op
+
+        conv = layer.conv_operator(
+            img=ipt0,
+            filter=ipt1,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv)
+        with layer.mixed() as conv1:
+            conv1 += conv
+
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+
+class NetworkTests(unittest.TestCase):
+    def test_vgg(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        vgg_out = networks.small_vgg(
+            input_image=img, num_channels=1, num_classes=2)
+        print layer.parse_network(vgg_out)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
new file mode 100644
index 00000000000000..1bf55a5bc68dfd
--- /dev/null
+++ b/python/paddle/v2/tests/test_topology.py
@@ -0,0 +1,83 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.v2.layer as layer
+import paddle.v2.topology as topology
+import paddle.v2.data_type as data_type
+import paddle.trainer_config_helpers as conf_helps
+
+
+class TestTopology(unittest.TestCase):
+    def test_data_type(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        data_types = topo.data_type()
+        self.assertEqual(len(data_types), 2)
+        pixel_data_type = filter(lambda type: type[0] == "pixel", data_types)
+        self.assertEqual(len(pixel_data_type), 1)
+        pixel_data_type = pixel_data_type[0]
+        self.assertEqual(pixel_data_type[1].type, data_type.DataType.Dense)
+        self.assertEqual(pixel_data_type[1].dim, 784)
+
+        label_data_type = filter(lambda type: type[0] == "label", data_types)
+        self.assertEqual(len(label_data_type), 1)
+        label_data_type = label_data_type[0]
+        self.assertEqual(label_data_type[1].type, data_type.DataType.Index)
+        self.assertEqual(label_data_type[1].dim, 10)
+
+    def test_get_layer(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        pixel_layer = topo.get_layer("pixel")
+        label_layer = topo.get_layer("label")
+        self.assertEqual(pixel_layer, pixel)
+        self.assertEqual(label_layer, label)
+
+    def test_parse(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        maxid = layer.max_id(input=inference)
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.cross_entropy_cost(input=inference, label=label)
+
+        topology.Topology(cost2).proto()
+        topology.Topology([cost1]).proto()
+        topology.Topology([cost1, cost2]).proto()
+        topology.Topology([inference, maxid]).proto()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
new file mode 100644
index 00000000000000..4c211254319bbd
--- /dev/null
+++ b/python/paddle/v2/topology.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+from paddle.proto.ModelConfig_pb2 import ModelConfig
+
+import layer as v2_layer
+
+__all__ = ['Topology']
+
+
+def __flatten__(lis):
+    """
+    Given a list, possibly nested to any level, return it flattened.
+    """
+    new_lis = []
+    for item in lis:
+        if isinstance(item, collections.Sequence):
+            new_lis.extend(__flatten__(item))
+        else:
+            new_lis.append(item)
+    return new_lis
+
+
+def __bfs_travel__(callback, *layers):
+    layers = __flatten__(layers)
+    for each_layer in layers:
+        __break__ = callback(each_layer)
+        if __break__:
+            return
+        __bfs_travel__(callback, *each_layer.__parent_layers__.values())
+
+
+class Topology(object):
+    """
+    Topology is used to store the information about all layers
+    and network configs.
+    """
+
+    def __init__(self, layers):
+        if not isinstance(layers, collections.Sequence):
+            __check_layer_type__(layers)
+            layers = [layers]
+        for layer in layers:
+            __check_layer_type__(layer)
+        self.layers = layers
+        self.__model_config__ = v2_layer.parse_network(*layers)
+        assert isinstance(self.__model_config__, ModelConfig)
+
+    def proto(self):
+        return self.__model_config__
+
+    def get_layer(self, name):
+        """
+        get v2.Layer Class instance by layer name
+        :param name:
+        :return:
+        """
+        result_layer = [None]
+
+        def __impl__(l):
+            if l.name == name:
+                result_layer[0] = l
+                return True  # break
+            return False
+
+        __bfs_travel__(__impl__, *self.layers)
+        if result_layer[0] is None:
+            raise ValueError("No such layer %s" % name)
+        return result_layer[0]
+
+    def data_layers(self):
+        """
+        get all data layer
+        :return:
+        """
+        data_layers = dict()
+
+        def __impl__(l):
+            if isinstance(l, v2_layer.DataLayerV2):
+                data_layers[l.name] = l
+
+        __bfs_travel__(__impl__, *self.layers)
+        return data_layers
+
+    def data_type(self):
+        """
+        get data_type from proto, such as:
+        [('image', dense_vector(768)), ('label', integer_value(10))]
+        """
+        data_layers = self.data_layers()
+        return [(nm, data_layers[nm].type)
+                for nm in self.proto().input_layer_names]
+
+
+def __check_layer_type__(layer):
+    if not isinstance(layer, v2_layer.LayerV2):
+        raise ValueError('layer should have type paddle.layer.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
new file mode 100644
index 00000000000000..5003f55f3e0d15
--- /dev/null
+++ b/python/paddle/v2/trainer.py
@@ -0,0 +1,174 @@
+import collections
+
+import py_paddle.swig_paddle as api
+
+from data_feeder import DataFeeder
+from topology import Topology
+from . import event as v2_event
+from . import optimizer as v2_optimizer
+from . import parameters as v2_parameters
+
+__all__ = ['ITrainer', 'SGD']
+
+
+def default_event_handler(event):
+    """
+    Default event handler. It will print some log and save mode.
+
+    TODO(yuyang18): Complete it!
+    :param event:
+    :return:
+    """
+    pass
+
+
+class ITrainer(object):
+    """
+    The interface of Trainer. The only exposed method is `train`.
+    """
+
+    def train(self, reader, topology, parameters, event_handler=None):
+        """
+        train method.
+
+        :param reader:
+        :param topology:
+        :param parameters:
+        :param event_handler:
+        :return:
+        """
+
+        raise NotImplementedError()
+
+
+class SGD(ITrainer):
+    def __init__(self, cost, parameters, update_equation):
+        """
+        Simple SGD Trainer.
+
+        :param update_equation: The optimizer object.
+        :type update_equation: v2_optimizer.Optimizer
+        """
+
+        if not isinstance(parameters, v2_parameters.Parameters):
+            raise TypeError('parameters should be parameters')
+
+        if not isinstance(update_equation, v2_optimizer.Optimizer):
+            raise TypeError("update equation parameter must be "
+                            "paddle.v2.optimizer.Optimizer")
+        topology = Topology(cost)
+        self.__optimizer__ = update_equation
+        self.__topology__ = topology
+        self.__parameters__ = parameters
+        self.__topology_in_proto__ = topology.proto()
+        self.__data_types__ = topology.data_type()
+        gm = api.GradientMachine.createFromConfigProto(
+            self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
+            self.__optimizer__.enable_types())
+        assert isinstance(gm, api.GradientMachine)
+        parameters.append_gradient_machine(gm)
+        self.__gradient_machine__ = gm
+        self.__gradient_machine__.randParameters()
+
+    def train(self, reader, num_passes=1, event_handler=None, reader_dict=None):
+        """
+        Training method. Will train num_passes of input data.
+
+        :param reader:
+        :param topology: Network Topology, use one or more Layers to represent it.
+        :param parameters: The parameter pools.
+        :param num_passes: The total train passes.
+        :param event_handler: Event handler. A method will be invoked when event
+                              occurred.
+        :type event_handler: (BaseEvent) => None
+        :return:
+        """
+        if event_handler is None:
+            event_handler = default_event_handler
+
+        if reader_dict is None:
+            reader_dict = self.default_reader_dict()
+
+        __check_train_args__(**locals())
+
+        updater = self.__optimizer__.create_local_updater()
+        updater.init(self.__gradient_machine__)
+
+        self.__gradient_machine__.start()
+        batch_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(batch_evaluator, api.Evaluator)
+        pass_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(pass_evaluator, api.Evaluator)
+        out_args = api.Arguments.createArguments(0)
+
+        feeder = DataFeeder(self.__data_types__, reader_dict)
+
+        for pass_id in xrange(num_passes):
+            event_handler(v2_event.BeginPass(pass_id))
+            pass_evaluator.start()
+            updater.startPass()
+            for batch_id, data_batch in enumerate(reader()):
+                pass_type = updater.startBatch(len(data_batch))
+                self.__gradient_machine__.forwardBackward(
+                    feeder(data_batch), out_args, pass_type)
+                batch_evaluator.start()
+                event_handler(
+                    v2_event.BeginIteration(
+                        pass_id=pass_id, batch_id=batch_id))
+                pass_type = updater.startBatch(len(data_batch))
+                self.__gradient_machine__.forwardBackward(
+                    feeder(data_batch), out_args, pass_type)
+                self.__gradient_machine__.eval(pass_evaluator)
+                self.__gradient_machine__.eval(batch_evaluator)
+                for each_param in self.__gradient_machine__.getParameters():
+                    updater.update(each_param)
+                # Get cost. We use numpy to calculate total cost for this batch.
+                cost_vec = out_args.getSlotValue(0)
+                cost_vec = cost_vec.copyToNumpyMat()
+                cost = cost_vec.sum() / len(data_batch)
+                updater.finishBatch(cost)
+                batch_evaluator.finish()
+                event_handler(
+                    v2_event.EndIteration(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        cost=cost,
+                        evaluator=batch_evaluator))
+
+            updater.finishPass()
+            pass_evaluator.finish()
+            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+        self.__gradient_machine__.finish()
+
+    def default_reader_dict(self):
+        reader_dict = dict()
+        for i, tp in enumerate(self.__data_types__):
+            reader_dict[tp[0]] = i
+        return reader_dict
+
+    def test(self, reader, reader_dict=None):
+        if reader_dict is None:
+            reader_dict = self.default_reader_dict()
+
+        feeder = DataFeeder(self.__data_types__, reader_dict)
+        evaluator = self.__gradient_machine__.makeEvaluator()
+        out_args = api.Arguments.createArguments(0)
+        evaluator.start()
+        for data_batch in reader():
+            self.__gradient_machine__.forward(
+                feeder(data_batch), out_args, api.PASS_TEST)
+            self.__gradient_machine__.eval(evaluator)
+
+        evaluator.finish()
+        return v2_event.TestResult(evaluator=evaluator)
+
+
+def __check_train_args__(reader, event_handler, **kwargs):
+    """
+    Check train function's argument types
+    """
+    if not callable(reader) or not isinstance(reader(), collections.Iterator):
+        raise TypeError('train_data_reader should be a function, '
+                        'which can return a iterator')
+    if not callable(event_handler):
+        raise TypeError('event handler should be a function')
diff --git a/python/setup.py.in b/python/setup.py.in
index b66a42e87c7870..68ca35265cf132 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -4,7 +4,10 @@ packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
-          'paddle.utils']
+          'paddle.utils',
+          'paddle.v2',
+          'paddle.v2.dataset',
+          'paddle.v2.reader']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',