[test] Sparse mega pr #168

Merged: 66 commits, merged on Aug 16, 2017
Commits
- c5f6648 [WIP] Sparse Tensor (#5800) (eric-haibin-lin, Jun 26, 2017)
- db65770 move storage type vector from nnvm to mxnet (#7054) (eric-haibin-lin, Jul 15, 2017)
- e2607da fix failed tests. add back 64bit support for dot (eric-haibin-lin, Jul 17, 2017)
- 978748e Improve copy sparse tensors (#7003) (reminisce, Jul 15, 2017)
- ce0fec8 bug fix for IdentityComputeRsp (eric-haibin-lin, Jul 18, 2017)
- a2b3d3e fix lint (eric-haibin-lin, Jul 18, 2017)
- 27c9ac0 add data partition for libsvm iter (#7027) (eric-haibin-lin, Jul 21, 2017)
- 3a394ea fix ndarray namespace (eric-haibin-lin, Jul 22, 2017)
- cf61a9e remove sparse embedding (#7165) (eric-haibin-lin, Jul 23, 2017)
- fe62976 remove untested gpu operators (#7172) (eric-haibin-lin, Jul 24, 2017)
- 4de0fdd Fix ndarray aux data issue (#7098) (reminisce, Jul 25, 2017)
- a472b61 Support K-dimensional row-sparse tensor (#7179) (eric-haibin-lin, Jul 25, 2017)
- 6a01b6e Improve sparse ndarray error message (#7181) (eric-haibin-lin, Jul 25, 2017)
- 05ddf38 construct row_sparse ndarray for dist-async (eric-haibin-lin, Jun 26, 2017)
- f57fc3c Merge remote-tracking branch 'upstream/master' into dmlc-sparse-squash (eric-haibin-lin, Jul 26, 2017)
- 0ed14d1 fix DotCsrRspRspImpl error message (#7191) (stefanhenneking, Jul 26, 2017)
- f0af872 GPU implementation of cast_storage (dense to csr) (#7081) (stefanhenneking, Jul 27, 2017)
- 6f0719f Sparse square sum (#7206) (reminisce, Jul 27, 2017)
- ec2c4bf Modify and Add documentation for mx.nd.zeros (#7197) (anirudh2290, Jul 27, 2017)
- 88eaac6 Merge remote-tracking branch 'upstream/master' into dmlc-sparse-squash (eric-haibin-lin, Jul 27, 2017)
- 3b94a3c Expose kWriteInplace for imperative execution (fcompute_ex and fstate… (eric-haibin-lin, Jul 28, 2017)
- 55e4763 Operator add_n for row sparse ndarrays (#7244) (reminisce, Aug 1, 2017)
- 7e1647c GPU implementation of cast_storage (dense to rsp) (#7223) (stefanhenneking, Aug 1, 2017)
- 5905ddc merge with dmlc/master (eric-haibin-lin, Aug 2, 2017)
- d8a9aba resolve merge conflict in ndarray.load (eric-haibin-lin, Aug 2, 2017)
- f686174 Improve StatefulOp/FCompute storage fallback (#134) (eric-haibin-lin, Aug 2, 2017)
- d0579c4 update sparse ndarray api (#139) (eric-haibin-lin, Aug 3, 2017)
- 56b5a63 Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 3, 2017)
- 325f4db Handle ograd_stype='row_sparse' for square_sum backward (#143) (reminisce, Aug 3, 2017)
- 5866b2b Sparse retain improvement (#138) (reminisce, Aug 5, 2017)
- 9298bfa ignoring variables in SimpleBind that is used on python's sparse bran… (sergeykolychev, Aug 5, 2017)
- 1f07771 add bias term to fm test (#145) (eric-haibin-lin, Aug 5, 2017)
- d511938 merge with upstream/master. resolve conflict in c_api_ndarray.cc (eric-haibin-lin, Aug 5, 2017)
- 6956431 update ndarray.nd, remove `invoke` from excluded members (#137) (eric-haibin-lin, Aug 6, 2017)
- 6c9a350 Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 6, 2017)
- 66b7b8a support storage fallback with mutable inputs (#147) (eric-haibin-lin, Aug 6, 2017)
- cf8ddcf Merge branch 'sparse' of https://github.com/eric-haibin-lin/mxnet int… (eric-haibin-lin, Aug 6, 2017)
- 0396c9a Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 7, 2017)
- 2dc7dc9 Code changes based on reviews (#144) (eric-haibin-lin, Aug 8, 2017)
- f318c9d small edits according to reviews (#151) (eric-haibin-lin, Aug 8, 2017)
- 85cbc60 Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 8, 2017)
- fc1aa6e fix lint (#152) (eric-haibin-lin, Aug 8, 2017)
- 9ba96b9 resolve conflict in ndarray.py and capi (eric-haibin-lin, Aug 8, 2017)
- 6cbdf98 resolve conflicts in license header (eric-haibin-lin, Aug 8, 2017)
- 253ae57 add license to all new files in sparse brnach (#154) (eric-haibin-lin, Aug 9, 2017)
- b2ad302 Allocate temp data on the fly for some casting operations (#149) (cjolivier01, Aug 9, 2017)
- 129148c Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 9, 2017)
- d6f987d fix utf8 encoding in sparse ndarray (eric-haibin-lin, Aug 9, 2017)
- 955e97f Merge branch 'sparse' of https://github.com/eric-haibin-lin/mxnet int… (eric-haibin-lin, Aug 9, 2017)
- bc33101 Extending the GPU dot operator (#7226) (stefanhenneking, Aug 9, 2017)
- 8040953 Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 9, 2017)
- 2d93d72 Add get_synthetic_dataset function to util (#146) (anirudh2290, Aug 10, 2017)
- 80a590d temporary fix for batch norm storage fallback (#156) (eric-haibin-lin, Aug 10, 2017)
- 92f54d2 support random_uniform/normal/gamma with row_sparse output (#155) (eric-haibin-lin, Aug 10, 2017)
- 17bfa4e Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 10, 2017)
- ef3b442 Merge remote-tracking branch 'upstream/master' into sparse (eric-haibin-lin, Aug 10, 2017)
- a44afed Square sum backward support one more case (#161) (reminisce, Aug 10, 2017)
- ceca9b6 Add documentation for sparse ops (#148) (eric-haibin-lin, Aug 11, 2017)
- 1c60a05 A few fixes (#163) (eric-haibin-lin, Aug 11, 2017)
- 04e9129 Merge branch 'sparse' of https://github.com/eric-haibin-lin/mxnet int… (eric-haibin-lin, Aug 12, 2017)
- 8ebc012 merge with upstream/master (eric-haibin-lin, Aug 12, 2017)
- 889a09e Minor fixes sparse ops (#160) (stefanhenneking, Aug 14, 2017)
- 6b0cac1 sparse Adam optimizer (#164) (eric-haibin-lin, Aug 14, 2017)
- eeff444 kvstore.row_sparse_pull for GPU and end-to-end benchmark: CPU vs. mul… (reminisce, Aug 15, 2017)
- 54f698b fix bug in adam update (#167) (eric-haibin-lin, Aug 15, 2017)
- 6fa078e change sparse example from regression to classification (#165) (eric-haibin-lin, Aug 15, 2017)
Files changed (changes from 4 commits)
5 changes: 0 additions & 5 deletions docs/api/python/ndarray.md
@@ -547,11 +547,6 @@ The `contrib.ndarray` module contains many useful experimental APIs for new feat
     :members:
     :special-members:
 
-.. autoclass:: mxnet.ndarray.BaseSparseNDArray
-    :members:
-    :special-members:
-    :exclude-members: __weakref__
-
 .. autoclass:: mxnet.ndarray.CSRNDArray
     :members:
     :special-members:
4 changes: 4 additions & 0 deletions python/mxnet/ndarray/ndarray.py
@@ -1109,6 +1109,10 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True):
     def tostype(self, stype):
         """Return a copy of the array with chosen storage type.
 
+        See Also
+        ----------
+        :meth:`mxnet.ndarray.cast_storage`.
+
         Returns
         -------
         NDArray, CSRNDArray or RowSparseNDArray
4 changes: 2 additions & 2 deletions python/mxnet/ndarray/sparse_ndarray.py
@@ -846,8 +846,8 @@ def _zeros_sparse_ndarray(stype, shape, ctx=None, dtype=None, aux_types=None, **
     dtype : str or numpy.dtype, optional
         An optional value type (default is `float32`)
     aux_types: list of numpy.dtype, optional
-        An optional type for the aux data for BaseSparseNDArray (default values depends
-        on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+        (default values depends on the storage type)
 
     Returns
     -------
14 changes: 7 additions & 7 deletions python/mxnet/ndarray/utils.py
@@ -37,10 +37,10 @@ def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs):
     dtype : str or numpy.dtype, optional
         An optional value type (default is `float32`)
     stype: string, optional
-        The storage type of the empty array, such as 'row_sparse', 'csr', etc
+        The storage type of the empty array, such as 'row_sparse', 'csr', etc.
     aux_types: list of numpy.dtype, optional
-        An optional type for the aux data for the BaseSparseNDArray (default values
-        depends on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+        (default values depend on the storage type)
 
     Returns
     -------
@@ -73,8 +73,8 @@ def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None):
     stype : str, optional
         An optional storage type (default is `default`).
     aux_types: list of numpy.dtype, optional
-        An optional type for the aux data for the BaseSparseNDArray (default values depends
-        on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+        (default values depend on the storage type)
 
     Returns
     -------
@@ -111,8 +111,8 @@ def array(source_array, ctx=None, dtype=None, aux_types=None):
         The data type of the output array. The default dtype is ``source_array.dtype``
         if `source_array` is an `NDArray`, `float32` otherwise.
     aux_types: list of numpy.dtype, optional
-        An optional type for the aux data for the BaseSparseNDArray (default values
-        depends on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+        (default values depend on the storage type)
 
     Returns
     -------
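For readers new to the aux data these docstrings mention: a row_sparse NDArray carries one auxiliary index array (the row indices), while a csr NDArray carries two (indptr and the column indices), and aux_types sets their dtypes. A minimal sketch against the zeros signature shown in this diff; exact keyword behavior at this snapshot of the API is an assumption::

    import numpy as np
    import mxnet as mx

    # row_sparse: one aux array (row indices); csr: two (indptr, column indices)
    rsp = mx.nd.zeros((4, 3), stype='row_sparse', aux_types=[np.int64])
    csr = mx.nd.zeros((4, 3), stype='csr', aux_types=[np.int64, np.int64])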
4 changes: 2 additions & 2 deletions src/operator/batch_norm.cu
@@ -283,7 +283,7 @@ __global__ void BatchNormalizationUpdateOutputKernel(
   }
 
   // Write normalized and update the output
-  const AccReal gamma = weight.numElements() > 0
+  const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0)
                         ? ScalarConvert<DType, AccReal>::to(weight[plane])
                         : ScalarConvert<int, AccReal>::to(1);
   const AccReal beta = bias.numElements() > 0 ? ScalarConvert<DType, AccReal>::to(bias[plane])
@@ -332,7 +332,7 @@ static __global__ void BatchNormalizationBackwardKernel(
     invstd = VARIANCE_TO_INVSTD(tensors.runningVar[plane], eps);
   }
 
-  const AccReal weightVal = tensors.weight.numElements() > 0 ?
+  const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ?
       ScalarConvert<DType, AccReal>::to(tensors.weight[plane]) : AccReal(1);
   const AccReal norm = AccReal(1) / N;
 
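Both kernel changes enforce fix_gamma semantics on the GPU path: when FIX_GAMMA_FLAG is set, the scale is read as 1 even though a weight tensor is present. A numpy sketch of the intended forward computation, as an illustration only, not the kernel itself::

    import numpy as np

    def batchnorm_out(x, mean, var, gamma, beta, eps=1e-3, fix_gamma=False):
        # fix_gamma forces the scale to 1, mirroring the FIX_GAMMA_FLAG check
        g = 1.0 if fix_gamma else gamma
        return (x - mean) / np.sqrt(var + eps) * g + beta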
60 changes: 40 additions & 20 deletions src/operator/random/sample_op.cu
@@ -28,21 +28,20 @@ namespace op {
 
 // GPU versions of uniform and normal distribution.
 template<>
-void SampleUniform_<gpu>(const nnvm::NodeAttrs& attrs,
-                         const OpContext& ctx,
-                         const std::vector<TBlob>& inputs,
-                         const std::vector<OpReqType>& req,
-                         const std::vector<TBlob>& outputs) {
+void SampleUniformDnsImpl<gpu>(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const OpReqType& req,
+                               TBlob* output) {
   using namespace mxnet::op;
   using namespace mshadow::expr;
   typedef gpu xpu;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   const SampleUniformParam& param = nnvm::get<SampleUniformParam>(attrs.parsed);
   mshadow::Random<xpu, float> *prnd = ctx.requested[0].get_random<xpu, float>(s);
-  if (outputs[0].type_flag_ != mshadow::kFloat32) {
-    MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+  if (output->type_flag_ != mshadow::kFloat32) {
+    MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, {
       // Not float32: use workspace and copy to output
-      mshadow::Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> out = output->FlatTo2D<xpu, DType>(s);
       mshadow::Tensor<xpu, 1, float> workspace =
           ctx.requested[1].get_space_typed<xpu, 1, float>
           (mshadow::Shape1(out.shape_.Size()), s);
@@ -51,27 +50,36 @@ void SampleUniform_<gpu>(const nnvm::NodeAttrs& attrs,
     });
   } else {
     // float32: write directly into output
-    mshadow::Tensor<xpu, 2, float> out = outputs[0].FlatTo2D<xpu, float>(s);
+    mshadow::Tensor<xpu, 2, float> out = output->FlatTo2D<xpu, float>(s);
     prnd->SampleUniform(&out, param.low, param.high);
   }
 }
 
 template<>
-void SampleNormal_<gpu>(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx,
-                        const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
+void SampleUniform_<gpu>(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+  TBlob out = outputs[0];
+  SampleUniformDnsImpl<gpu>(attrs, ctx, req[0], &out);
+}
+
+template<>
+void SampleNormalDnsImpl<gpu>(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const OpReqType& req,
+                              TBlob* output) {
   using namespace mxnet::op;
   using namespace mshadow::expr;
   typedef gpu xpu;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   const SampleNormalParam& param = nnvm::get<SampleNormalParam>(attrs.parsed);
   mshadow::Random<xpu, float> *prnd = ctx.requested[0].get_random<xpu, float>(s);
-  if (outputs[0].type_flag_ != mshadow::kFloat32) {
-    MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+  if (output->type_flag_ != mshadow::kFloat32) {
+    MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, {
       // Not float32: use workspace and copy to output
-      mshadow::Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> out = output->FlatTo2D<xpu, DType>(s);
       mshadow::Tensor<xpu, 1, float> workspace =
           ctx.requested[1].get_space_typed<xpu, 1, float>
           (mshadow::Shape1(out.shape_.Size()), s);
@@ -80,16 +88,28 @@ void SampleNormal_<gpu>(const nnvm::NodeAttrs& attrs,
     });
   } else {
     // float32: write directly into output
-    mshadow::Tensor<xpu, 2, float> out = outputs[0].FlatTo2D<xpu, float>(s);
+    mshadow::Tensor<xpu, 2, float> out = output->FlatTo2D<xpu, float>(s);
     prnd->SampleGaussian(&out, param.loc, param.scale);
   }
 }
 
+template<>
+void SampleNormal_<gpu>(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  TBlob out = outputs[0];
+  SampleNormalDnsImpl<gpu>(attrs, ctx, req[0], &out);
+}
+
 NNVM_REGISTER_OP(random_uniform)
-.set_attr<FCompute>("FCompute<gpu>", SampleUniform_<gpu>);
+.set_attr<FCompute>("FCompute<gpu>", SampleUniform_<gpu>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SampleUniformEx_<gpu>);
 
 NNVM_REGISTER_OP(random_normal)
-.set_attr<FCompute>("FCompute<gpu>", SampleNormal_<gpu>);
+.set_attr<FCompute>("FCompute<gpu>", SampleNormal_<gpu>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SampleNormalEx_<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
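The shape of this refactor: the dense sampling body moves into a *DnsImpl function that takes a single output TBlob, the original FCompute entry point becomes a thin wrapper around it, and the storage-aware FComputeEx entry point registered above can reuse the same dense kernel on the value blob of a sparse output. A Python analogy of that structure; the names and the dict-based row_sparse container are illustrative, not MXNet APIs::

    import numpy as np

    def sample_uniform_dns_impl(output, low, high, rng):
        # dense kernel: fill a pre-allocated buffer in place
        output[...] = rng.uniform(low, high, size=output.shape)

    def sample_uniform(outputs, low, high, rng=np.random):
        # FCompute-style wrapper: dense output only
        sample_uniform_dns_impl(outputs[0], low, high, rng)

    def sample_uniform_ex(out_rsp, low, high, rng=np.random):
        # FComputeEx-style wrapper: a uniform sample is dense with
        # probability 1, so mark every row present, then reuse the
        # dense kernel on the value blob
        out_rsp['indices'] = np.arange(out_rsp['values'].shape[0])
        sample_uniform_dns_impl(out_rsp['values'], low, high, rng)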
33 changes: 33 additions & 0 deletions src/operator/tensor/cast_storage.cc
@@ -32,6 +32,39 @@ namespace op {
 DMLC_REGISTER_PARAMETER(CastStorageParam);
 NNVM_REGISTER_OP(cast_storage)
 .describe(R"code(Casts tensor storage type to the new type.
+
+When an NDArray with default storage type is cast to csr or row_sparse storage,
+the result is compact, which means:
+
+- for csr, zero values will not be retained
+- for row_sparse, row slices of all zeros will not be retained
+
+The storage type of ``cast_storage`` output depends on stype parameter:
+
+- cast_storage(csr, 'default') = default
+- cast_storage(row_sparse, 'default') = default
+- cast_storage(default, 'csr') = csr
+- cast_storage(default, 'row_sparse') = row_sparse
+
+Example::
+
+    dense = [[ 0.,  1.,  0.],
+             [ 2.,  0.,  3.],
+             [ 0.,  0.,  0.],
+             [ 0.,  0.,  0.]]
+
+    # cast to row_sparse storage type
+    rsp = cast_storage(dense, 'row_sparse')
+    rsp.indices = [0, 1]
+    rsp.values = [[ 0.,  1.,  0.],
+                  [ 2.,  0.,  3.]]
+
+    # cast to csr storage type
+    csr = cast_storage(dense, 'csr')
+    csr.indices = [1, 0, 2]
+    csr.values = [ 1.,  2.,  3.]
+    csr.indptr = [0, 1, 3, 3, 3]
+
+)code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
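The indptr/indices/values triple in this example can be cross-checked with scipy, whose CSR layout uses the same convention; a sketch for verification, not MXNet code::

    import numpy as np
    import scipy.sparse as sp

    dense = np.array([[0., 1., 0.],
                      [2., 0., 3.],
                      [0., 0., 0.],
                      [0., 0., 0.]])

    csr = sp.csr_matrix(dense)
    print(csr.indptr)    # [0 1 3 3 3]
    print(csr.indices)   # [1 0 2]
    print(csr.data)      # [1. 2. 3.]

    # row_sparse keeps only the rows that contain at least one nonzero
    rows = np.flatnonzero(dense.any(axis=1))   # [0 1]
    values = dense[rows]                       # [[0. 1. 0.], [2. 0. 3.]]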
8 changes: 8 additions & 0 deletions src/operator/tensor/dot.cc
@@ -49,6 +49,14 @@ NNVM_REGISTER_OP(dot)
     y = reshape([7,6,5,4,3,2,1,0], shape=(2,2,2))
     dot(x,y)[0,0,1,1] = 0
     sum(x[0,0,:]*y[:,1,1]) = 0
+
+The storage type of ``dot`` output depends on storage types of inputs and transpose options:
+
+- dot(csr, default) = default
+- dot(csr.T, default) = row_sparse
+- dot(csr, row_sparse) = default
+- otherwise, ``dot`` generates output with default storage
+
 )doc" ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
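The dot(csr.T, default) = row_sparse rule follows from the structure of the product: row i of csr.T times a dense matrix can be nonzero only if column i of the csr input holds a nonzero, so all-zero columns become all-zero output rows. A scipy sketch of that observation, as an illustration rather than the MXNet operator::

    import numpy as np
    import scipy.sparse as sp

    lhs = sp.csr_matrix(np.array([[0., 1., 0.],
                                  [0., 0., 3.]]))   # column 0 is all zeros
    rhs = np.ones((2, 4))

    out = lhs.T.dot(rhs)                            # shape (3, 4)
    print(np.flatnonzero(out.any(axis=1)))          # [1 2]: row 0 stays zero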
9 changes: 8 additions & 1 deletion src/operator/tensor/elemwise_binary_op_basic.cc
@@ -28,7 +28,14 @@ namespace mxnet {
 namespace op {
 MXNET_OPERATOR_REGISTER_BINARY(elemwise_add)
 .add_alias("_add").add_alias("_plus").add_alias("_Plus")
-.describe("Adds arguments element-wise.")
+.describe(R"code(Adds arguments element-wise.
+
+The storage type of ``elemwise_add`` output depends on storage types of inputs
+
+- elemwise_add(row_sparse, row_sparse) = row_sparse
+- otherwise, ``elemwise_add`` generates output with default storage
+
+)code")
 .set_attr<FCompute>("FCompute<cpu>", BinaryCompute<cpu, mshadow::op::plus>)
 .set_attr<nnvm::FGradient>("FGradient", CloneGradient{"_backward_add"})
 .set_attr<FComputeEx>("FComputeEx<cpu>", BinaryComputeEx<cpu, mshadow::op::plus>)
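For the row_sparse + row_sparse case, the result's row set is the union of the inputs' row sets, which is why the output stays row_sparse. A numpy sketch of that merge; rsp_add is a hypothetical helper, not the MXNet kernel::

    import numpy as np

    def rsp_add(idx_a, val_a, idx_b, val_b):
        # output rows = sorted union of the input row indices
        idx = np.union1d(idx_a, idx_b)
        val = np.zeros((idx.size,) + val_a.shape[1:])
        val[np.searchsorted(idx, idx_a)] += val_a
        val[np.searchsorted(idx, idx_b)] += val_b
        return idx, val

    idx, val = rsp_add(np.array([0, 2]), np.ones((2, 3)),
                       np.array([2, 3]), np.ones((2, 3)))
    # idx = [0 2 3]; row 2 sums contributions from both inputs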
6 changes: 6 additions & 0 deletions src/operator/tensor/elemwise_sum.cc
@@ -110,6 +110,12 @@ NNVM_REGISTER_OP(add_n)
    add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n
 
 ``add_n`` is potentially more efficient than calling ``add`` by `n` times.
+
+The storage type of ``add_n`` output depends on storage types of inputs
+
+- add_n(row_sparse, row_sparse, ..) = row_sparse
+- otherwise, ``add_n`` generates output with default storage
+
 )doc" ADD_FILELINE)
 .set_attr_parser(ParamParser<ElementWiseSumParam>)
 .set_num_inputs([](const nnvm::NodeAttrs& attrs) {
5 changes: 5 additions & 0 deletions src/operator/tensor/sparse_retain.cc
@@ -41,6 +41,11 @@ Example::
   rsp_out.values = [[1, 2], [5, 6]]
   rsp_out.indices = [0, 3]
 
+The storage type of ``sparse_retain`` output depends on storage types of inputs
+
+- sparse_retain(row_sparse, default) = row_sparse
+- otherwise, ``sparse_retain`` is not supported
+
 )code" ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
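A numpy rendering of the docstring example; the input (indices [0, 1, 3] with values [[1, 2], [3, 4], [5, 6]]) is not visible in the truncated context above and is an assumption, and this is a sketch rather than the MXNet operator::

    import numpy as np

    rsp_idx = np.array([0, 1, 3])                  # assumed input rows
    rsp_val = np.array([[1, 2], [3, 4], [5, 6]])   # assumed input values
    to_retain = np.array([0, 3])

    mask = np.isin(rsp_idx, to_retain)             # keep only retained rows
    print(rsp_idx[mask])   # [0 3]
    print(rsp_val[mask])   # [[1 2], [5 6]]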
35 changes: 17 additions & 18 deletions src/operator/tensor/square_sum-inl.h
@@ -196,30 +196,26 @@ struct SquareSumRspGradKernel<req, 1> {
 };
 
 /*!
- * This kernel assumes that the ograd and in_data
- * are all rsp and have equal row_idx array.
- * TODO(junwu): make the kernel general to support
- * the cases when ograd and in_data have different
- * row_idx arrays.
+ * Note: This kernel assumes that the ograd and in_data
+ * are all rsp and have equal row_idx array, or
+ * in_data is a full rsp.
 */
 template<int req>
 struct SquareSumRspGradKernel<req, 1, kRowSparseStorage> {
   /*!
-   * \param i index of out_grad_row_idx
+   * \param i index of igrad.data()
    * \param in_grad_row_idx row_idx of the gradient of the op's input
    * \param in_grad gradient of the op's input
   * \param out_grad_row_idx row_idx of the gradient of the op's output
   * \param out_grad gradient of the op's output
-   * \param in_row_idx row idx of the op's input
    * \param in_data op's input
   */
   template<typename IType, typename DType>
   MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad,
                                   const IType* out_grad_row_idx, const DType* out_grad,
-                                  const IType* in_row_idx, const DType* in_data,
-                                  const int64_t num_cols) {
+                                  const DType* in_data, const int64_t num_cols) {
     const int64_t row = i / num_cols;
-    in_grad_row_idx[row] = in_row_idx[row];
+    in_grad_row_idx[row] = out_grad_row_idx[row];
     KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[row]);
   }
 };
@@ -341,7 +337,7 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
   const TBlob& igrad_data = igrad->data();
   const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx);
   const TBlob& ograd_data = ograd.data();
-  const TBlob in_data = input.data();
+  const TBlob& in_data = input.data();
   const TBlob in_row_idx = input.aux_data(rowsparse::kIdx);
   if (ograd.storage_type() == kDefaultStorage) {
     if (0 == param.axis[0]) {  // forward is sum per column
@@ -372,16 +368,20 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
                                            " when ograd_stype = kRowSparseStorage";
     CHECK_EQ(ograd.shape().ndim(), 2U);
     const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx);
-    CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size());
+    CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]);
     MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, {
       if (std::is_same<xpu, cpu>::value) {
         const IType* first1 = ograd_row_idx.dptr<IType>();
         const IType* last1 = first1 + ograd_row_idx.Size();
         const IType* first2 = in_row_idx.dptr<IType>();
-        CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports"
-                                                    " equal ograd_row_idx and input_row_idx"
-                                                    " when ograd and input are both"
-                                                    " row-sparse";
+        // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp
+        // ograd_row_idx and in_row_idx are expected to have the same elements
+        if (ograd_row_idx.Size() == in_row_idx.Size() && in_row_idx.Size() != in_data.shape_[0]) {
+          CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports"
+                                                      " equal ograd_row_idx and input_row_idx"
+                                                      " when ograd and input are both"
+                                                      " row-sparse";
+        }
      } else {
        LOG(FATAL) << "SquareSumRspGradImpl has not implemented GPU version when"
                      " ograd and input are both row-sparse";
@@ -391,8 +391,7 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
         Kernel<SquareSumRspGradKernel<req_type, 1, kRowSparseStorage>, xpu>::Launch(
             s, igrad_data.Size(), igrad_row_idx.dptr<IType>(),
             igrad_data.dptr<DType>(), ograd_row_idx.dptr<IType>(),
-            ograd_data.dptr<DType>(), in_row_idx.dptr<IType>(),
-            in_data.dptr<DType>(), num_cols);
+            ograd_data.dptr<DType>(), in_data.dptr<DType>(), num_cols);
       })
     })
   })
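The kernel body reduces to igrad = 2 * in_data * ograd broadcast along rows, with the output row_idx now copied from ograd_row_idx instead of the input's row_idx. A numpy sketch of the aligned-row_idx case, as an illustration only::

    import numpy as np

    ograd_row_idx = np.array([0, 2])
    ograd = np.array([5., 14.])                 # one scalar per stored row
    in_data = np.array([[1., 2.], [3., 1.]])    # values for the same rows

    igrad = 2 * in_data * ograd[:, None]        # the KERNEL_ASSIGN line, vectorized
    igrad_row_idx = ograd_row_idx.copy()        # row_idx taken from ograd, per the fix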
3 changes: 1 addition & 2 deletions tests/python/unittest/test_operator.py
@@ -867,7 +867,6 @@ def check_batchnorm_training(stype):
         rolling_mean = np.random.uniform(size=s)
         rolling_std = np.random.uniform(size=s)
 
-        stype = 'row_sparse'
         data = mx.symbol.Variable('data', stype=stype)
         in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype),
                        mx.nd.array(beta).tostype(stype)]
@@ -935,7 +934,7 @@ def check_batchnorm_training(stype):
         test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis)
         check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01)
 
-    stypes = ['row_sparse', 'csr', 'default']
+    stypes = ['row_sparse', 'default']
     for stype in stypes:
         check_batchnorm_training(stype)