diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index ba6c413819e4..e57712549e61 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -382,7 +382,8 @@ static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &p TShape shape = input.shape(); return SupportMKLDNN(input) && shape.ndim() == 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS - && shape[param.axis] % 8 == 0; + && shape[param.axis] % 8 == 0 + && !mxnet::op::batchnorm::disable_mkl; } void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs, diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 019b5c932ac8..53712a6f921e 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -19,11 +19,12 @@ #ifndef TEST_CORE_OP_H_ #define TEST_CORE_OP_H_ +#include #include #include #include #include -#include +#include #include "./test_op.h" #include "../../../src/imperative/imperative_utils.h" @@ -61,37 +62,6 @@ template class CoreOpExecutor : public test::op::OperatorDataInitializer , public test::op::OperatorExecutorTiming { /*! \brief Performance timing categories */ - /*! - * \brief Access data blob as if on the CPU via a callback - * \tparam Type of callback Function to call with CPU-data NDArray - * \param src Source NDArray (on GPU or CPU) - * \param run_ctx Run context - * \param cb Callback Function to call with CPU-data NDArray - */ - template - static inline void AccessAsCPU(const NDArray &src, - const RunContext &run_ctx, - CallbackFunction cb) { -#if MXNET_USE_CUDA - if (src.ctx().dev_type == Context::kCPU) { - cb(src); - } else { - Context cpu_ctx, gpu_ctx = src.ctx(); - cpu_ctx.dev_type = Context::kCPU; - cpu_ctx.dev_id = 0; - NDArray on_cpu(src.shape(), cpu_ctx); - on_cpu.CheckAndAlloc(); - TBlob tmp1 = on_cpu.data(); - mxnet::ndarray::Copy(src.data(), &tmp1, cpu_ctx, gpu_ctx, run_ctx); - cb(on_cpu); - TBlob tmp2 = src.data(); - mxnet::ndarray::Copy(on_cpu.data(), &tmp2, gpu_ctx, cpu_ctx, run_ctx); - } -#else - cb(src); -#endif - } - /*! 
* \brief Parse additional arguments into NodeAttrs structure * \param op Pointer to operator object @@ -119,6 +89,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer */ static inline std::vector& CollectBlobs(const std::vector& src, std::vector *dest) { + dest->resize(0); dest->reserve(dest->size() + src.size()); for (size_t i = 0, n = src.size(); i < n; ++i) { dest->emplace_back(src[i].data()); @@ -132,13 +103,11 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param ctx Context to use when creating the array/tensor * \return The created NDArray */ - NDArray CreateRandArray(const TShape& shape, const Context& ctx) const { + NDArray CreateRandArray(const TShape& shape, const RunContext& run_ctx, int dtype) const { CHECK_GT(shape.Size(), 0); // Check it's a valid shape - NDArray array(shape, ctx, true, mshadow::DataType::kFlag); + NDArray array(shape, run_ctx.ctx, true, dtype); array.CheckAndAlloc(); - AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) { - test::op::OperatorDataInitializer::FillRandom(arr.data()); - }); + test::op::OperatorDataInitializer::FillRandom(run_ctx, array.data()); return array; } @@ -148,13 +117,11 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param ctx Context to use when creating the array/tensor * \return The created NDArray */ - NDArray CreateZeroArray(const TShape& shape, const Context& ctx) const { + NDArray CreateZeroArray(const TShape& shape, const RunContext& run_ctx, int dtype) const { CHECK_GT(shape.Size(), 0); // Check it's a valid shape - NDArray array(shape, ctx, true, mshadow::DataType::kFlag); + NDArray array(shape, run_ctx.ctx, true, dtype); array.CheckAndAlloc(); - AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) { - test::op::OperatorDataInitializer::FillZero(arr.data()); - }); + test::op::OperatorDataInitializer::FillZero(run_ctx, array.data()); return array; } @@ -225,40 +192,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } public: - enum BlobVectorType { - kInput, - kOutput, - kAux, - kInGrad, - kOutGrad, - kBlobVectorTypeCount - }; - -#define CASE_STR(__v$) case (__v$): return #__v$ - - /*! \brief Convert BlobVectorType enum into a string */ - static inline const char *bvt2String(const BlobVectorType bvt) { - switch (bvt) { - CASE_STR(kInput); - CASE_STR(kOutput); - CASE_STR(kAux); - CASE_STR(kInGrad); - CASE_STR(kOutGrad); - default: - CHECK(false); - return ""; - } - } -#undef CASE_STR - - inline const std::vector& getBlobVect(const BlobVectorType bvt) const { - // Not implemented - CHECK(false); - static std::vector dummy; - return dummy; - } - - typedef DType DataType; typedef AccReal AccRealType; @@ -327,73 +260,55 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer #endif } - static nnvm::NodePtr GetBackwardDependency(const nnvm::NodePtr& node, - uint32_t num_inputs, - uint32_t num_outputs - //std::vector *p_save_inputs, - //std::vector *p_save_outputs - ) { + /*! 
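A minimal usage sketch of the two factory helpers as re-signed above (illustration only, not part of the patch; `exec` is an assumed CoreOpExecutor instance and `rctx` a valid RunContext). The dtype is now an explicit runtime argument instead of being fixed by the executor's DType template parameter:

  NDArray rnd  = exec.CreateRandArray(TShape({1, 3, 28, 28}), rctx, mshadow::kFloat32);
  NDArray zero = exec.CreateZeroArray(TShape({1, 3, 28, 28}), rctx, mshadow::kFloat16);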
+ * \brief Get the operator context + * \return Reference to this operator's context object + */ + const OpContext& ctx() const { + return ctx_; + } - const Op* op = node->op(); - if(op) { - if(!op->name.empty()) { - if(op->name == "BatchNorm") { - std::cout << "Imperative::GetBackwardDependency( " << op->name << " )" << std::endl; - } - } - } + static inline int default_dtype() { + using DTypeInfo = typename mshadow::DataType<DType>; + return DTypeInfo::kFlag; + } + nnvm::NodePtr GetBackwardDependency(const nnvm::NodePtr& node, + std::map<int, const NDArray*>* index2array) const { + index2array->clear(); static auto& fgradient = nnvm::Op::GetAttr<nnvm::FGradient>("FGradient"); -// std::vector<bool>& save_inputs = *p_save_inputs; -// std::vector<bool>& save_outputs = *p_save_outputs; -// save_inputs.resize(num_inputs); -// save_outputs.resize(num_outputs); -// std::fill(save_inputs.begin(), save_inputs.end(), false); -// std::fill(save_outputs.begin(), save_outputs.end(), false); + + const uint32_t num_inputs = inputs().size(); + const uint32_t num_outputs = outputs().size(); node->inputs.clear(); node->inputs.reserve(num_inputs); for (uint32_t i = 0; i < num_inputs; ++i) { node->inputs.emplace_back(nnvm::NodeEntry{nullptr, i, 0}); + (*index2array)[i] = &inputs()[i]; } if (fgradient.count(node->op())) { std::vector<nnvm::NodeEntry> ograd_entries; ograd_entries.reserve(num_outputs); for (uint32_t i = 0; i < num_outputs; ++i) { - ograd_entries.emplace_back(nnvm::NodeEntry{nullptr, i, 1}); + const uint32_t index = num_inputs + i; + ograd_entries.emplace_back(nnvm::NodeEntry{nullptr, index, 1}); + (*index2array)[index] = &outputs()[i]; } const std::vector<nnvm::NodeEntry> igrad_entries = fgradient[node->op()](node, ograd_entries); - if(!igrad_entries.empty()) { + if (!igrad_entries.empty()) { return igrad_entries[0].node; } - -// for (const auto& i : igrad_entries) { -// if (i.node == nullptr && i.version == 0) { -// save_inputs[i.index] = true; -// } else if (i.node == node) { -// save_outputs[i.index] = true; -// } -// } -// DFSVisit(igrad_entries, [&](const nnvm::NodePtr& gnode) { -// if (!gnode || gnode == node) return; -// for (const auto& i : gnode->inputs) { -// if (i.node == nullptr && i.version == 0) { -// save_inputs[i.index] = true; -// } else if (i.node == node) { -// save_outputs[i.index] = true; -// } -// } -// }); } return nullptr; } - nnvm::NodePtr CalcBackwardPass() const { + nnvm::NodePtr CalcBackwardPass(std::map<int, const NDArray*> *index2array) const { nnvm::NodePtr node = nnvm::Node::Create(); node->attrs = attrs_; - return GetBackwardDependency(node, inputs().size(), outputs().size()); + return GetBackwardDependency(node, index2array); } /*! 
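GetBackwardDependency now records, for every synthetic NodeEntry index it hands to FGradient, which forward-pass array that index stands for: forward inputs occupy [0, num_inputs) and the ograd entries occupy [num_inputs, num_inputs + num_outputs). A hypothetical consumer (illustration only; `fwd` is an assumed forward executor):

  std::map<int, const NDArray*> index2array;
  nnvm::NodePtr bwd = fwd.CalcBackwardPass(&index2array);
  for (const nnvm::NodeEntry& e : bwd->inputs) {
    // Resolve each backward input back to the forward array feeding it
    const NDArray* feeding = index2array.at(e.index);
    LOG(INFO) << "bwd input <- forward array with shape " << feeding->shape();
  }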
@@ -424,9 +339,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer op_ = nnvm::Op::Get(op_name); CHECK_NOTNULL(op_); + std::map index2array; nnvm::NodePtr bwd_node_ptr; - if(backward_for_op) { - bwd_node_ptr = backward_for_op->CalcBackwardPass(); + if (backward_for_op) { + bwd_node_ptr = backward_for_op->CalcBackwardPass(&index2array); } // Set up forward @@ -435,57 +351,33 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer int num_inputs = op_->num_inputs; if (op_->get_num_inputs) { num_inputs = op_->get_num_inputs(attrs_); - } else if(backward_for_op) { - CHECK_NOTNULL(bwd_node_ptr.get()); - num_inputs = static_cast(bwd_node_ptr->inputs.size()); + } else if (backward_for_op) { + if(bwd_node_ptr) { + num_inputs = static_cast(bwd_node_ptr->inputs.size()); + } } -// if(backward_for_op) { -// const int num_fwd_outputs = backward_for_op->outputs().size(); -// num_inputs = std::max(num_fwd_outputs, num_inputs); -// } - if (!inputs.empty()) { CHECK_EQ(inputs.size(), static_cast(num_inputs)); } int inferred_num_outputs /*, num_visible_outputs*/; -// imperative::SetNumOutputs(op_, attrs_, num_inputs, &inferred_num_outputs, -// &num_visible_outputs); - if (op_->get_num_outputs) { inferred_num_outputs = op_->get_num_outputs(attrs_); } else { inferred_num_outputs = op_->num_outputs; } -// static auto& finput_names = Op::GetAttr("FListInputNames"); -// if(finput_names.count(op_)) { -// std::vector i_names = finput_names[op_](attrs_); -// const int i_name_count = i_names.size(); -// num_inputs = std::max(i_name_count, num_inputs); -// } - //using FListInputNames = std::function (const NodeAttrs& attrs)>; - -// static auto& grad_fun_map = Op::GetAttr("FGradient"); -// if(grad_fun_map.count(op_)) { -// auto grad_fun = grad_fun_map[op_]; -// nnvm::NodePtr nodeptr = std::make_shared(); -// nodeptr->attrs = attrs_; -// std::vector out_grads; -// std::vector entries = grad_fun(nodeptr, out_grads); -// const int grad_count = entries.size(); -// num_inputs = std::max(grad_count, num_inputs); -// } - - //CHECK_GE(inferred_num_outputs, num_visible_outputs); // Generic, all shapes the same. Probably this will need to be adjusted for more complex // operators such as dot - std::vector input_shapes; - for (size_t i = 0, n = num_inputs; i < n; ++i) { - input_shapes.emplace_back(i < input_shapes_.size() ? input_shapes_[i] - : input_shapes_[input_shapes_.size() - 1]); + std::vector input_shapes; + if (!input_shapes_.empty()) { + for (size_t i = 0, n = num_inputs; i < n; ++i) { + input_shapes.emplace_back(i < input_shapes_.size() ? input_shapes_[i] + : input_shapes_[input_shapes_.size() + - 1]); + } } std::vector inputs_p, outputs_p; @@ -498,52 +390,115 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer outputs_.reserve(inferred_num_outputs); outputs_p.reserve(inferred_num_outputs); - for (size_t i = 0; i < static_cast(num_inputs); ++i) { - CHECK_LT(i, static_cast(input_shapes.size())); - inputs_.emplace_back(i < inputs.size() ? 
inputs[i] : CreateRandArray(input_shapes[i], - ctx_.run_ctx.ctx)); - inputs_p.emplace_back(&*inputs_.rbegin()); + std::vector<int> input_types; + input_types.reserve(num_inputs); + std::vector<int> output_types; + output_types.reserve(inferred_num_outputs); + + static auto& finfer_type = Op::GetAttr<nnvm::FInferType>("FInferType"); + if (finfer_type.count(op_)) { + input_types.resize(num_inputs, -1); + input_types[0] = default_dtype(); // Set first input to default type + output_types.resize(inferred_num_outputs, -1); + finfer_type[op_](attrs_, &input_types, &output_types); + CHECK_EQ(input_types.size(), num_inputs); + CHECK_EQ(output_types.size(), inferred_num_outputs); + } else { + if (backward_for_op) { + if (bwd_node_ptr) { + CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs); + input_types.resize(bwd_node_ptr->inputs.size(), -1); + for (size_t i = 0; i < num_inputs; ++i) { + const int map_key = bwd_node_ptr->inputs[i].index; + CHECK(index2array.find(map_key) != index2array.end()); + input_types[i] = index2array[map_key]->dtype(); + } + for (const auto &fwd_inp : backward_for_op->inputs()) { + output_types.emplace_back(fwd_inp.data().type_flag_); + } + } else { + for (size_t x = 0; x < num_inputs; ++x) { + input_types.emplace_back(default_dtype()); + } + for (const auto &fwd_inp : backward_for_op->inputs()) { + output_types.emplace_back(fwd_inp.data().type_flag_); + } + } + } else { + LOG(FATAL) << "Unreachable: an operator without FInferType is only expected on the backward pass"; + } } // Output arrays - if(outputs_.empty()) { + if (outputs_.empty()) { std::vector<TShape> output_shapes; static auto& finfer_shape = Op::GetAttr<nnvm::FInferShape>("FInferShape"); if (finfer_shape.count(op_)) { nnvm::FInferShape call_infer_shapes = finfer_shape[op_]; output_shapes.resize(inferred_num_outputs); call_infer_shapes(attrs_, &input_shapes, &output_shapes); + input_shapes_ = input_shapes; } else { - // TODO: this should be only if outputs param is empty - output_shapes = input_shapes; - output_shapes.resize(inferred_num_outputs); + if (backward_for_op) { + // BWD Input shapes + if (bwd_node_ptr) { + input_shapes.clear(); + CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs); + for (size_t i = 0; i < num_inputs; ++i) { + const int map_key = bwd_node_ptr->inputs[i].index; + CHECK(index2array.find(map_key) != index2array.end()); + input_shapes.push_back(index2array[map_key]->shape()); + } + } + input_shapes_ = input_shapes; + // BWD Output shapes + output_shapes = backward_for_op->input_shapes_; + CHECK_EQ(output_shapes.size(), inferred_num_outputs); + } else { + output_shapes = input_shapes; + output_shapes.resize(inferred_num_outputs); + } } CHECK_EQ(output_shapes.size(), inferred_num_outputs); + for (size_t i = 0; i < static_cast<size_t>(inferred_num_outputs); ++i) { // If supplied and valid, pass from the supplied outputs vector // Otherwise use empty for forward pass, or zero-filled for backward pass outputs_.emplace_back(i < outputs.size() ? outputs[i] : (backward_for_op ? 
CreateZeroArray(output_shapes[i], - ctx_.run_ctx.ctx) + ctx_.run_ctx, + output_types[i]) : NDArray())); outputs_p.emplace_back(&*outputs_.rbegin()); } } + for (size_t i = 0; i < static_cast(num_inputs); ++i) { + CHECK_LT(i, static_cast(input_shapes.size())); + inputs_.emplace_back(i < inputs.size() + ? inputs[i] : CreateRandArray(input_shapes[i], + ctx_.run_ctx, + input_types[i])); + inputs_p.emplace_back(&*inputs_.rbegin()); + } + if (!backward_for_op) { DispatchMode dispatch_mode = DispatchMode::kUndefined; imperative::SetShapeType(ctx_.run_ctx.ctx, attrs_, inputs_p, outputs_p, &dispatch_mode); - } else { - // Backward op, so set based upon inputs - //CHECK_EQ(static_cast(num_visible_outputs), backward_for_op->inputs().size()); -// for (int i = 0; i < num_visible_outputs; ++i) { -// CHECK_LT(static_cast(i), input_shapes.size()); -// // backward outputs should look like forward inputs -// // TODO(cjolivier01): This check fails for dot product... -// // Need better inference of backward shapes -// // CHECK_EQ(backward_for_op->inputs()[i].shape(), outputs_[i].shape()); -// } } std::vector req; @@ -591,11 +546,15 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer template inline bool initForward(const OpProp &opProp, std::vector *in_type) { Init(opProp.GetArgs()); + resetForward(); return true; } template - inline bool initBackward(const OpProp &opProp, std::vector *in_type) { return true; } + inline bool initBackward(const OpProp &opProp, std::vector *in_type) { + resetBackward(); + return true; + } inline void forward(const size_t count) { perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), kForward, "Forward", count); @@ -620,6 +579,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer void Execute() { CHECK_EQ(initialized_, true); CHECK_NOTNULL(function_); + CollectBlobs(inputs_, &blob_inputs_); + CollectBlobs(outputs_, &blob_outputs_); function_(attrs_, ctx_, blob_inputs_, req_, blob_outputs_); } @@ -668,14 +629,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer return false; } - /*! - * \brief Get the operator context - * \return Reference to this operator's context object - */ - const OpContext& ctx() const { - return ctx_; - } - /*! * \brief Access input NDArray vector * \return reference to NDArray vector of forward inputs @@ -726,13 +679,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer verbose_ = verbose; } - virtual void resetForward() { - CHECK(false) << "Not implemented, generally inits forward-pass data"; - } + virtual void resetForward() {} - virtual void resetBackward() { - CHECK(false) << "Not implemented, generally inits backward-pass data"; - } + virtual void resetBackward() {} private: /*! 
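The call order implied by the changes above, summarized as a sketch (harness-internal names):

  // Init(args)      -> build inputs_/outputs_, infer shapes and dtypes
  // resetForward()  -> seed forward-pass data (base implementation is now a no-op)
  // resetBackward() -> seed backward-pass data (base implementation is now a no-op)
  // Execute()       -> CollectBlobs() again, then invoke the cached function_
  //
  // Re-collecting blob_inputs_/blob_outputs_ inside Execute() keeps the raw
  // TBlob views in sync if an NDArray was re-allocated after initialization.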
diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h index 498fa06650a1..e4c5b3e1febb 100644 --- a/tests/cpp/include/test_legacy_op.h +++ b/tests/cpp/include/test_legacy_op.h @@ -376,16 +376,6 @@ class LegacyOperatorExecutor : public OperatorDataInitializer<DType> copy(blob, sourceData, 0, sourceDataSize); } - void FillRandom() { - for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) { - std::vector<TBlob> *data_vect = this->c_.all_blob_vects_[j]; - if (data_vect) { - for (size_t i = 0, n = data_vect->size(); i < n; ++i) { - OperatorDataInitializer<DType>::FillRandom((*data_vect)[i]); - } - } - } - } std::vector<TBlob>& inputs() { return c_.blob_input_vec_; } const std::vector<TBlob>& inputs() const { return c_.blob_input_vec_; } diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h index 066168e2623f..7a0c6d3878ee 100644 --- a/tests/cpp/include/test_op.h +++ b/tests/cpp/include/test_op.h @@ -100,12 +100,12 @@ class OperatorDataInitializer { * \brief Fill a blob with random values * \param blob Blob which to fill with random values */ - void FillRandom(const TBlob& blob) const { + void FillRandom(const RunContext& run_ctx, const TBlob& blob) const { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wabsolute-value" std::uniform_real_distribution<> dis_real(-5.0, 5.0); std::uniform_int_distribution<> dis_int(-128, 127); - test::patternFill<DType>(&blob, [this, &dis_real, &dis_int]() -> DType { + test::patternFill(run_ctx, &blob, [this, &dis_real, &dis_int]() -> DType { if (!std::is_integral<DType>::value) { DType val; do { @@ -123,8 +123,8 @@ class OperatorDataInitializer { #pragma clang diagnostic pop } - void FillZero(const TBlob& blob) const { - test::patternFill<DType>(&blob, []() -> DType { return DType(0); }); + void FillZero(const RunContext& run_ctx, const TBlob& blob) const { + test::patternFill(run_ctx, &blob, []() -> DType { return DType(0); }); } private: @@ -223,8 +223,8 @@ class Validator { /*! \brief Compare blob data */ static bool compare(const TBlob& b1, const TBlob& b2) { if (b1.shape_ == b2.shape_) { + CHECK_EQ(b1.type_flag_, b2.type_flag_) << "Can't compare blobs of different data types"; MSHADOW_REAL_TYPE_SWITCH(b1.type_flag_, DTypeX, { - CHECK_EQ(b1.type_flag_, b2.type_flag_) << "Can't compare blobs of different data types"; const DTypeX *d1 = b1.dptr<DTypeX>(); const DTypeX *d2 = b2.dptr<DTypeX>(); CHECK_NE(d1, d2); // don't compare the same memory @@ -255,7 +255,7 @@ const DTypeX v2 = *valuePtr++; EXPECT_NEAR(v1, v2, kErrorBound); if (!isNear(v1, v2, kErrorBound) && !warningCount++) { - LOG(WARNING) << "Near test failure: " << i << ", " << n << std::endl << std::flush; + on_failure(i, n, v1, v2, kErrorBound); } } return true; diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index e33b9a6e68c7..1e4faebfad99 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -34,6 +34,8 @@ #include #include +#include "../../../src/ndarray/ndarray_function.h" + #if MXNET_USE_VTUNE #include #endif @@ -132,56 +134,136 @@ class StandaloneBlob : public TBlob { std::shared_ptr memory_; }; +/*! 
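Usage sketch for the re-signed initializers (illustration only; `init`, `rctx`, and `blob` are assumed to be an OperatorDataInitializer-derived object, a RunContext, and a TBlob). Threading the RunContext through lets the helpers stage GPU data on the CPU:

  init.FillRandom(rctx, blob);  // pattern-based random fill, CPU or GPU blob
  init.FillZero(rctx, blob);    // zero fill, CPU or GPU blob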
+ * \brief Access a TBlob's data on the CPU within the scope of this object + * Overloaded () operator returns the CPU-bound TBlob + * RAII will copy the data back to the GPU (if it was a GPU blob) + */ +class CAccessAsCPU { + public: + CAccessAsCPU(const RunContext& run_ctx, const TBlob& src, bool copy_back_result = true) + : run_ctx_(run_ctx) + , src_(src) + , copy_back_result_(copy_back_result) { #if MXNET_USE_CUDA -/*! \brief Return blob in CPU memory */ -inline StandaloneBlob BlobOnCPU(const RunContext &rctx, const TBlob& src) { - StandaloneBlob res(src.shape_, false, src.type_flag_); - if (src.dev_mask() == cpu::kDevMask) { - LOG(WARNING) << "BlobOnCPU() is safe, but try not to call this with a CPU blob" - << " because it is inefficient"; - memcpy(res.dptr_, src.dptr_, res.MemorySize()); - } else { - mshadow::Stream *stream = rctx.get_stream(); - MSHADOW_TYPE_SWITCH(src.type_flag_, DType, { - mshadow::Copy(res.FlatTo1D(), src.FlatTo1D(stream), stream); - }); + if (run_ctx.ctx.dev_type == Context::kCPU) { + blob_ = src; + } else { + Context cpu_ctx, gpu_ctx = run_ctx.ctx; + cpu_ctx.dev_type = Context::kCPU; + cpu_ctx.dev_id = 0; + NDArray on_cpu(src.shape_, cpu_ctx, false, src_.type_flag_); + on_cpu.CheckAndAlloc(); + blob_ = on_cpu.data(); + run_ctx.get_stream()->Wait(); + mxnet::ndarray::Copy(src, &blob_, cpu_ctx, gpu_ctx, run_ctx); + run_ctx.get_stream()->Wait(); + on_cpu_ = on_cpu; + } +#else + blob_ = src; +#endif + } + ~CAccessAsCPU() { +#if MXNET_USE_CUDA + if (copy_back_result_) { + // Copy back from GPU to CPU + if (run_ctx_.ctx.dev_type == Context::kGPU) { + Context cpu_ctx, gpu_ctx = run_ctx_.ctx; + cpu_ctx.dev_type = Context::kCPU; + cpu_ctx.dev_id = 0; + run_ctx_.get_stream()->Wait(); + mxnet::ndarray::Copy(blob_, &src_, gpu_ctx, cpu_ctx, run_ctx_); + run_ctx_.get_stream()->Wait(); + } + } +#endif + } + inline const TBlob& operator ()() const { + return blob_; } - return res; -} -#endif // MXNET_USE_CUDA -constexpr const size_t MPRINT_PRECISION = 5; + private: + const RunContext run_ctx_; + TBlob src_; + const bool copy_back_result_; + NDArray on_cpu_; + TBlob blob_; +}; -template -inline void fill(const TBlob& blob, const DType val) { - DType *p1 = blob.dptr(); - for (size_t i = 0, n = blob.Size(); i < n; ++i) { - *p1++ = val; +/*! + * \brief Access data blob as if on the CPU via a callback + * \tparam Type of callback Function to call with CPU-data NDArray + * \param src Source NDArray (on GPU or CPU) + * \param run_ctx Run context + * \param cb Callback Function to call with CPU-data NDArray + */ +template +inline void AccessAsCPU(const NDArray &src, + const RunContext &run_ctx, + CallbackFunction cb) { +#if MXNET_USE_CUDA + if (src.ctx().dev_type == Context::kCPU) { + cb(src); + } else { + Context cpu_ctx, gpu_ctx = src.ctx(); + cpu_ctx.dev_type = Context::kCPU; + cpu_ctx.dev_id = 0; + NDArray on_cpu(src.shape(), cpu_ctx, false, src.dtype()); + on_cpu.CheckAndAlloc(); + TBlob tmp1 = on_cpu.data(); + run_ctx.get_stream()->Wait(); + mxnet::ndarray::Copy(src.data(), &tmp1, cpu_ctx, gpu_ctx, run_ctx); + run_ctx.get_stream()->Wait(); + cb(on_cpu); + TBlob tmp2 = src.data(); + mxnet::ndarray::Copy(on_cpu.data(), &tmp2, gpu_ctx, cpu_ctx, run_ctx); + run_ctx.get_stream()->Wait(); } +#else + cb(src); +#endif } -template -inline void try_fill(const TBlob *blob, const DType val) { - if(blob) { - DType *p1 = blob->dptr(); - for (size_t i = 0, n = blob->Size(); i < n; ++i) { - *p1++ = val; - } +/*! 
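A minimal usage sketch of the RAII wrapper defined above (illustration only; `rctx` and `maybe_gpu_blob` are assumed):

  {
    test::CAccessAsCPU cpu(rctx, maybe_gpu_blob, /*copy_back_result=*/true);
    const TBlob& host = cpu();  // CPU-resident view of the data
    // ... read or mutate host here ...
  }  // destructor copies the data back to the source blob (CUDA builds only)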
+ * \brief Access data blob as if on the CPU via a callback + * \tparam CallbackFunction Type of callback function to call with the CPU-data TBlob + * \param src Source TBlob (on GPU or CPU) + * \param run_ctx Run context + * \param cb Callback function to call with the CPU-data TBlob + */ +template <typename CallbackFunction> +inline void AccessAsCPU(const TBlob& src, + const RunContext &run_ctx, + CallbackFunction cb) { +#if MXNET_USE_CUDA + if (run_ctx.ctx.dev_type == Context::kCPU) { + cb(src); + } else { + cb(CAccessAsCPU(run_ctx, src, true)()); + } +#else + cb(src); +#endif +} constexpr const size_t MPRINT_PRECISION = 5; template <typename DType> -inline void fill(const TBlob& blob, const DType *valArray) { - DType *p1 = blob.dptr<DType>(); - for (size_t i = 0, n = blob.Size(); i < n; ++i) { - *p1++ = *valArray++; - } +inline void fill(const RunContext &run_ctx, const TBlob& _blob, const DType val) { + AccessAsCPU(_blob, run_ctx, [val](const TBlob& blob) { + MSHADOW_TYPE_SWITCH(blob.type_flag_, DTypeX, { + DTypeX *p1 = blob.dptr<DTypeX>(); + for (size_t i = 0, n = blob.Size(); i < n; ++i) { + *p1++ = val; + } + }); + }); } template <typename DType> -inline void try_fill(const std::vector<TBlob>& container, size_t index, const DType value) { - if (index < container.size()) { - test::fill(container[index], value); +inline void try_fill(const RunContext &run_ctx, const TBlob *blob, const DType val) { + if (blob) { + fill(run_ctx, *blob, val); } } @@ -292,7 +374,8 @@ inline StreamType& print_blob_(const RunContext& ctx, const bool add_endl = true) { #if MXNET_USE_CUDA if (blob.dev_mask() == gpu::kDevMask) { - return print_blob_<DType>(ctx, _os, BlobOnCPU(ctx, blob), doChannels, doBatches, add_endl); + return print_blob_<DType>(ctx, _os, CAccessAsCPU(ctx, blob, false)(), doChannels, + doBatches, add_endl); } #endif // MXNET_USE_CUDA @@ -407,9 +490,10 @@ inline StreamType& print_blob_(const RunContext& ctx, if (add_endl) { os << std::endl; } - } - if (!add_endl) { + } else if (!add_endl) { os << " "; + } else { + os << std::endl; } os << std::flush; return os; } @@ -553,62 +637,76 @@ inline std::string type_name() { return demangle(typeid(T).name()); } * 2D: batch item -> channel -> row -> col * 3D: batch item -> channel -> col */ -template <typename DType, typename GetNextData> -static inline void patternFill(const TBlob *blob, GetNextData getNextData) { - const size_t dim = blob->ndim(); - CHECK_LE(dim, 5U) << "Will need to handle above 3 dimensions (another for loop)"; - const size_t num = blob->size(0); - const size_t channels = dim > 1 ? blob->size(1) : 1; - const size_t depth = dim > 2 ? blob->size(2) : 1; - const size_t height = dim > 3 ? blob->size(3) : 1; - const size_t width = dim > 4 ? 
blob->size(4) : 1; - const size_t numberOfIndexes = blob->shape_.Size(); - for (size_t n = 0; n < num; ++n) { - if (dim > 1) { - for (size_t ch = 0; ch < channels; ++ch) { - if (dim > 2) { - for (size_t d = 0; d < depth; ++d) { - if (dim > 3) { - for (size_t row = 0; row < height; ++row) { - if (dim > 4) { - for (size_t col = 0; col < width; ++col) { - if (dim == 5) { - const size_t idx = test::offset(blob->shape_, {n, ch, d, row, col}); - CHECK_LT(idx, numberOfIndexes); - DType &f = blob->dptr()[idx]; - f = getNextData(); - } else { - CHECK(dim <= 5) << "Unimplemented dimension: " << dim; +template +static inline void patternFill(const RunContext& run_ctx, + const TBlob *_blob, + GetNextData getNextData) { + AccessAsCPU(*_blob, run_ctx, [getNextData](const TBlob& blob) { + const size_t dim = static_cast(blob.ndim()); + CHECK_LE(dim, 5U) << "Will need to handle above 3 dimensions (another for loop)"; + const size_t num = blob.size(0); + const size_t channels = dim > 1 ? blob.size(1) : 1; + const size_t depth = dim > 2 ? blob.size(2) : 1; + const size_t height = dim > 3 ? blob.size(3) : 1; + const size_t width = dim > 4 ? blob.size(4) : 1; + const size_t numberOfIndexes = blob.shape_.Size(); + for (size_t n = 0; n < num; ++n) { + if (dim > 1) { + for (size_t ch = 0; ch < channels; ++ch) { + if (dim > 2) { + for (size_t d = 0; d < depth; ++d) { + if (dim > 3) { + for (size_t row = 0; row < height; ++row) { + if (dim > 4) { + for (size_t col = 0; col < width; ++col) { + if (dim == 5) { + const size_t idx = test::offset(blob.shape_, {n, ch, d, row, col}); + CHECK_LT(idx, numberOfIndexes); + MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { + ThisDataType &f = blob.dptr()[idx]; + f = getNextData(); + }); + } else { + CHECK(dim <= 5) << "Unimplemented dimension: " << dim; + } } + } else { + const size_t idx = test::offset(blob.shape_, {n, ch, d, row}); + CHECK_LT(idx, numberOfIndexes); + MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { + ThisDataType &f = blob.dptr()[idx]; + f = getNextData(); + }); } - } else { - const size_t idx = test::offset(blob->shape_, {n, ch, d, row}); - CHECK_LT(idx, numberOfIndexes); - DType &f = blob->dptr()[idx]; - f = getNextData(); } + } else { + const size_t idx = test::offset(blob.shape_, {n, ch, d}); + CHECK_LT(idx, numberOfIndexes); + MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { + ThisDataType &f = blob.dptr()[idx]; + f = getNextData(); + }); } - } else { - const size_t idx = test::offset(blob->shape_, {n, ch, d}); - CHECK_LT(idx, numberOfIndexes); - DType &f = blob->dptr()[idx]; - f = getNextData(); } + } else { + const size_t idx = test::offset(blob.shape_, {n, ch}); + CHECK_LT(idx, numberOfIndexes); + MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { + ThisDataType &f = blob.dptr()[idx]; + f = getNextData(); + }); } - } else { - const size_t idx = test::offset(blob->shape_, {n, ch}); - CHECK_LT(idx, numberOfIndexes); - DType &f = blob->dptr()[idx]; - f = getNextData(); } + } else { + const size_t idx = test::offset(blob.shape_, {n}); + CHECK_LT(idx, numberOfIndexes); + MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { + ThisDataType &f = blob.dptr()[idx]; + f = getNextData(); + }); } - } else { - const size_t idx = test::offset(blob->shape_, {n}); - CHECK_LT(idx, numberOfIndexes); - DType &f = blob->dptr()[idx]; - f = getNextData(); } - } + }); } /*! 
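Usage sketch for the reworked patternFill (illustration only; `rctx` and `blob` are assumed). The blob's dtype is now resolved internally via MSHADOW_TYPE_SWITCH, so the generator can simply produce doubles:

  double v = 0;
  test::patternFill(rctx, &blob, [&v]() -> double { return v += 1; });  // 1, 2, 3, ...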
\brief Return a random number within a given range (inclusive) */ diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index aaa1add21b5f..4b08d985de3e 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -18,9 +18,9 @@ */ /*! - * Copyright (c) 2017 by Contributors + * Copyright (c) 2018 by Contributors * \file batchnorm_test.cc - * \brief batchnorm operator unit test utility functions + * \brief batchnorm operator unit tests and utility functions * \author Chris Olivier */ @@ -28,14 +28,14 @@ #include #include "../../src/operator/nn/batch_norm-inl.h" #include "../../src/operator/batch_norm_v1-inl.h" +#include "../../src/operator/operator_common.h" #include "./test_legacy_op.h" #include "./test_core_op.h" #include "executor/exec_pass.h" using namespace mxnet; -#define SIMPLE_DIMENSIONS 1 -#define MXNET_DUMP_C 0 +#define SIMPLE_DIMENSIONS 0 #define DISABLE_VALIDATION 0 // If performance profiling, may do things // that cause validation to fail @@ -49,8 +49,8 @@ static constexpr int DW = 3; static constexpr int BATCH_SIZE = 1; static constexpr int CHANNELS = 1; static constexpr int DEPTH = 1; -static constexpr int DH = 2; -static constexpr int DW = 1; +static constexpr int DH = 3; +static constexpr int DW = 2; #endif static constexpr int TIMING_BATCH_SIZE = 128; @@ -59,11 +59,52 @@ static constexpr int TIMING_DEPTH = 2; static constexpr int TIMING_DH = 28; static constexpr int TIMING_DW = 28; +#define PRT(__lbl$, __var$) \ + test::print(ctx.run_ctx, &(std::cout << (__lbl$) << ": "), (__var$), true) + +/*! + * \brief Forward + */ +enum ForwardInputs { + /* in_data */ kForInData, kForGamma, kForBeta, + /* aux_states */ kForMovingMean, kForMovingVar +}; +enum ForwardOutputs { + /* outputs */ kForOutData, kForOutMean, kForOutVar +}; + +/*! + * \brief Backward + */ +enum BackwardInputs { + /* out_grad */ bwd_out_grad_Grad, bwd_out_grad_Mean, bwd_out_grad_Var, + /* in_data */ bwd_in_data_Data, bwd_in_data_Gamma, bwd_in_data_Beta, + /* aux_states */ bwd_aux_states_MovingMean, bwd_aux_states_MovingVar, + /* out_data */ bwd_out_data_Data, bwd_out_data_Mean, bwd_out_data_Var +}; +enum BackwardOutputs { + /* in_grad */ bwd_in_grad_Data /* Original input data */, + /* weight, bias */ bwd_in_grad_Gamma, bwd_in_grad_Beta +}; + +/** + * _____ _ _____ _ _ + * | __ \ | | |_ _| (_)| | + * | | | | __ _| |_ __ _ | | _ __ _ | |_ + * | | | |/ _` | __|/ _` | | | | '_ \| || __| + * | |__| | (_| | |_| (_| | _| |_| | | | || |_ + * |_____/ \__,_|\__|\__,_| |_____|_| |_|_| \__| + * + * + */ /*! 
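Spelled out, the BackwardInputs enum above flattens the four argument groups of BatchNorm's gradient into the single index space used by bwd_inputs() (this is the test harness's convention, not an nnvm API):

  //   out_grad   : Grad, Mean, Var        -> bwd_inputs()[0..2]
  //   in_data    : Data, Gamma, Beta      -> bwd_inputs()[3..5]
  //   aux_states : MovingMean, MovingVar  -> bwd_inputs()[6..7]
  //   out_data   : Data, Mean, Var        -> bwd_inputs()[8..10]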
\brief BatchNorm-specific test data */ template class BNOperatorExecutor : public test::op::CoreOpExecutor { using Super = typename test::op::CoreOpExecutor; + public: + using Super::ctx; + BNOperatorExecutor(const bool isGPU, const TShape& inputShape, const test::op::kwargs_t& kwargs, const bool hasWeightAndBias = false) @@ -72,132 +113,78 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { param_.Init(kwargs); } - //using BlobVectorType = typename test::op::CoreOpExecutor::BlobVectorType; - - enum ForwardInputs { kForInData, kForGamma, kForBeta, kForMovingMean, kForMovingVar }; - enum ForwardOutputs { kForOutData, kForOutMean, kForOutVar }; - - enum BackwardInputs { kBackOutGrad, kBackOutGradMean, kBackOutGradVar, kBackData, - kBackGamma, kBackBeta, kBackInMovingMean, kBackInMovingVar, kBackOutData, kBackOutMean, - kBackOutVar }; - - enum WhichArray { - kForwardIn, - kForwardOut, - kBackwardIn, - kBackwardOut - }; - - const NDArray *GetForwardInArray(const int idx) const { + const NDArray *GetForwardInArray(const ForwardInputs idx) const { const std::vector &arrs = Super::inputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetForwardOutArray(const int idx) const { + const NDArray *GetForwardOutArray(const ForwardOutputs idx) const { const std::vector &arrs = Super::outputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetBackwardOutArray(const int idx) const { + const NDArray *GetBackwardInArray(const BackwardInputs idx) { + const std::vector &arrs = Super::bwd_inputs(); + CHECK_LT(idx, arrs.size()); + return &arrs[idx]; + } + + const NDArray *GetBackwardOutArray(const BackwardOutputs idx) const { const std::vector &arrs = Super::bwd_outputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetBackwardInArray(const int idx) const { - const std::vector &arrs = Super::bwd_inputs(); - switch (idx) { - case kBackOutGrad: - CHECK_LT(kBackOutGrad, arrs.size()); - return &arrs[kBackOutGrad]; - case kBackOutGradMean: - if (param_.output_mean_var) { - CHECK_LT(kBackOutGradMean, arrs.size()); - return &arrs[kBackOutGradMean]; - } else { - CHECK(false); - return nullptr; - } - case kBackOutGradVar: - if (param_.output_mean_var) { - return &arrs[kBackOutGradVar]; - } else { - CHECK(false); - return nullptr; - } - default: { - const size_t index = param_.output_mean_var ? 
idx : idx - 2; - if(index < arrs.size()) { - return &arrs[index]; - } - return nullptr; - } - } - } + NDArray *GetArray(const ForwardInputs idx) { + return const_cast<NDArray *>(GetForwardInArray(idx)); } - const TBlob *GetBackwardInBlob(const int idx) const { - const NDArray * arr = GetBackwardInArray(idx); - if(arr) { - return &arr->data(); - } - return nullptr; + NDArray *GetArray(const ForwardOutputs idx) { + return const_cast<NDArray *>(GetForwardOutArray(idx)); } - const NDArray *GetArray(const WhichArray wa, const int idx) const { - switch(wa) { - case kForwardIn: - return GetForwardInArray(idx); - case kForwardOut: - return GetForwardOutArray(idx); - case kBackwardIn: - return GetBackwardOutArray(idx); - case kBackwardOut: - default: - CHECK(false); // need to check params - return nullptr; - } + NDArray *GetArray(const BackwardOutputs idx) { + return const_cast<NDArray *>(GetBackwardOutArray(idx)); } - inline const TBlob& Blob(const NDArray *arr) const { return arr->data(); } + NDArray *GetArray(const BackwardInputs idx) { + return const_cast<NDArray *>(GetBackwardInArray(idx)); + } + + inline const TBlob& Blob(const NDArray *arr) { return arr->data(); } template <typename EnumType> - const TBlob& GetBlob(const WhichArray wa, const EnumType idx) const { - return GetArray(wa, idx)->data(); + const TBlob& GetBlob(const EnumType idx) const { + return const_cast<BNOperatorExecutor<DType, AccReal> *>(this)->GetArray(idx)->data(); } void resetForward() override { - // Start by filling all inputs and outputs with an arbitrary value + Super::resetForward(); + + // Start by filling all inputs and outputs with arbitrary values for (size_t i = 0, n = Super::inputs().size(); i < n; ++i) { - const TBlob& out = Blob(&Super::inputs()[i]); - const int dtype = out.type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, { test::fill(out, DTypeX(0.1234)); }); + test::try_fill(ctx().run_ctx, &Super::inputs()[i].data(), 0.1234); } for (size_t i = 0, n = Super::outputs().size(); i < n; ++i) { - const TBlob& out = Blob(&Super::outputs()[i]); - const int dtype = out.type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, { test::fill(out, DTypeX(0.1234)); }); + test::try_fill(ctx().run_ctx, &Super::outputs()[i].data(), 0.5678); + } + for (size_t i = 0, n = Super::bwd_inputs().size(); i < n; ++i) { + test::try_fill(ctx().run_ctx, &Super::bwd_inputs()[i].data(), 0.9012); + } + for (size_t i = 0, n = Super::bwd_outputs().size(); i < n; ++i) { + test::try_fill(ctx().run_ctx, &Super::bwd_outputs()[i].data(), 0.3456); } // Init input data - MSHADOW_TYPE_SWITCH( - Blob(GetForwardInArray(kForInData)).type_flag_, - //this->c_.blob_input_vec_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - { - DTypeX val = 0; - test::patternFill( - &Blob(GetForwardInArray(kForInData)), - //&this->c_.blob_input_vec_[mxnet::op::batchnorm::kData], - [&val]{ return val += 1; }); }); + double val = 0; + test::patternFill(ctx().run_ctx, &GetBlob(kForInData), [&val]() -> double { return val += 1; }); MSHADOW_TYPE_SWITCH( - Blob(GetForwardInArray(kForGamma)).type_flag_, - //this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma].type_flag_, + GetBlob(kForGamma).type_flag_, DTypeX, { - //const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma]; - const TBlob& blob = Blob(GetForwardInArray(kForGamma)); - test::fill(blob, DTypeX(1)); + const TBlob& blob = GetBlob(kForGamma); + test::fill(ctx().run_ctx, blob, DTypeX(1)); if (hasWeightAndBias_) { if (blob.size(0) > 1) { blob.dptr<DTypeX>()[1] = DTypeX(3); } } }); MSHADOW_TYPE_SWITCH( - 
Blob(GetForwardInArray(kForBeta)).type_flag_, - //this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta].type_flag_, + GetBlob(kForBeta).type_flag_, DTypeX, { - //const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta]; - const TBlob& blob = Blob(GetForwardInArray(kForBeta)); + const TBlob& blob = GetBlob(kForBeta); if (!hasWeightAndBias_) { - test::fill(blob, DTypeX(0)); + test::fill(ctx().run_ctx, blob, DTypeX(0)); } else { // This will cause forward pass check to fail when calculating sum == 0 - test::fill(blob, DTypeX(1)); + test::fill(ctx().run_ctx, blob, DTypeX(1)); if (blob.size(0) > 0) { blob.dptr()[0] = DTypeX(3); } @@ -221,93 +206,77 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { }); // Init the moving data (all mean = 0, all var = 1) - MSHADOW_TYPE_SWITCH( - //this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean].type_flag_, - Blob(GetForwardInArray(kForMovingMean)).type_flag_, - DTypeX, { - test::fill(Blob(GetForwardInArray(kForMovingMean)), DTypeX(0)); - //test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean], DTypeX(0)); - }); - MSHADOW_TYPE_SWITCH( - Blob(GetForwardInArray(kForMovingVar)).type_flag_, - //this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar].type_flag_, - DTypeX, { - //test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar], DTypeX(1));}); - test::fill(Blob(GetForwardInArray(kForMovingVar)), DTypeX(1)); - }); + test::try_fill(ctx().run_ctx, &GetBlob(kForMovingMean), 0); + test::try_fill(ctx().run_ctx, &GetBlob(kForMovingVar), 1); + test::try_fill(ctx().run_ctx, &GetBlob(kForOutMean), 0); + test::try_fill(ctx().run_ctx, &GetBlob(kForOutVar), 1); } void resetBackward() override { - // Start by filling all backward inputs and outputs with an arbitrary value - for (size_t i = 0, n = Super::bwd_inputs().size(); i < n; ++i) { - const TBlob& out = Blob(&Super::bwd_inputs()[i]); - const int dtype = out.type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, { test::fill(out, DTypeX(0.5678)); }); - } - for (size_t i = 0, n = Super::bwd_outputs().size(); i < n; ++i) { - const TBlob& out = Blob(&Super::bwd_outputs()[i]); - const int dtype = out.type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, { test::fill(out, DTypeX(0.5678)); }); - } - DType val = -.001; + Super::resetBackward(); + + // Join forward input and in_data array + double val = 0; + test::patternFill(ctx().run_ctx, &GetBlob(bwd_in_data_Data), [&val]() -> double { + return val += 1; + }); + MSHADOW_TYPE_SWITCH( - GetBlob(kBackwardIn, kBackOutGrad).type_flag_, - //this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut].type_flag_, + GetBlob(bwd_in_data_Gamma).type_flag_, DTypeX, { - test::patternFill( - &GetBlob(kBackwardIn, kBackOutGrad), - //&this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut], - [&val]{ return val += 1; }); + const TBlob& blob = GetBlob(bwd_in_data_Gamma); + test::fill(ctx().run_ctx, blob, DTypeX(1)); + if (hasWeightAndBias_) { + if (blob.size(0) > 1) { + blob.dptr()[1] = DTypeX(3); + } + } }); - - // out-grad weights - //if (mxnet::op::batchnorm::kGamma < this->c_.blob_out_grad_.size()) { - if (GetBackwardInBlob(kBackGamma)) { - MSHADOW_TYPE_SWITCH( - GetBackwardInBlob(kBackGamma)->type_flag_, - //this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(GetBackwardInBlob(kBackGamma), DTypeX(0.1)); }); - } - - // out-grad biases - if (GetBackwardInBlob(kBackBeta)) { - MSHADOW_TYPE_SWITCH( - GetBackwardInBlob(kBackBeta)->type_flag_, - 
//this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(GetBackwardInBlob(kBackBeta), DTypeX(0.1)); }); - } - - /* - // in-grad MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kData, DTypeX(0)); }); - - // in-grad weights - if (mxnet::op::batchnorm::kGamma < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0)); }); - } + GetBlob(bwd_in_data_Beta).type_flag_, + DTypeX, { + const TBlob& blob = GetBlob(bwd_in_data_Beta); + if (!hasWeightAndBias_) { + test::fill(ctx().run_ctx, blob, DTypeX(0)); + } else { // This will cause forward pass check to fail when calculating sum == 0 + test::fill(ctx().run_ctx, blob, DTypeX(1)); + if (blob.size(0) > 0) { + blob.dptr()[0] = DTypeX(3); + } + } + }); - // in-grad biases - if (mxnet::op::batchnorm::kBeta < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0)); }); - } - */ + // Join aux arrays + test::try_fill(ctx().run_ctx, &GetBlob(bwd_aux_states_MovingMean), 0); + test::try_fill(ctx().run_ctx, &GetBlob(bwd_aux_states_MovingVar), 1); + + val = -.101; + test::patternFill(ctx().run_ctx, &GetBlob(bwd_out_data_Data), [&val]() -> double { + return val += 1; }); + test::try_fill(ctx().run_ctx, &GetBlob(bwd_out_data_Mean), 0.0); + test::try_fill(ctx().run_ctx, &GetBlob(bwd_out_data_Var), 1.0); + + val = -.001; + test::patternFill(ctx().run_ctx, &GetBlob(bwd_out_grad_Grad), [&val]() -> double { + return val += 0.01; }); + test::try_fill(ctx().run_ctx, &GetBlob(bwd_out_grad_Mean), 0.0); + test::try_fill(ctx().run_ctx, &GetBlob(bwd_out_grad_Var), 1.0); } const bool hasWeightAndBias_; // This will cause forward pass validation to fail op::BatchNormParam param_; }; +/** + * __ __ _ _ _ _ + * \ \ / / | |(_) | | | | + * \ \ / /__ _| | _ __| | __ _| |_ ___ _ __ + * \ \/ // _` | || |/ _` |/ _` | __|/ _ \| '__| + * \ /| (_| | || | (_| | (_| | |_| (_) | | + * \/ \__,_|_||_|\__,_|\__,_|\__|\___/|_| + * + * + */ /*! 
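The distinct constants used above (0.1234 forward inputs, 0.5678 forward outputs, 0.9012 backward inputs, 0.3456 backward outputs) act as per-buffer sentinels; a hypothetical check for an unwritten gradient buffer (illustration only; float data, `rctx`, and `grad_blob` assumed):

  bool written = true;
  test::AccessAsCPU(grad_blob, rctx, [&written](const TBlob& b) {
    const float *p = b.dptr<float>();
    for (size_t i = 0; i < b.Size(); ++i) {
      if (p[i] == 0.3456f) { written = false; break; }  // sentinel survived
    }
  });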
\brief Validate batch norm test outputs */ template class BatchNormValidator : public test::op::Validator { @@ -350,14 +319,14 @@ class BatchNormValidator : public test::op::Validator { // expect zero mean EXPECT_NEAR(0, sum, kErrorBound); if (!Super::isNear(AccReal(0), sum, kErrorBound)) { - LOG(WARNING) << "Sum is not close enough to zero " + LOG(WARNING) << "Sum is not close enough to zero: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1 " + LOG(WARNING) << "Variance is not close enough to 1: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } @@ -375,7 +344,7 @@ class BatchNormValidator : public test::op::Validator { const size_t height = blob->shape_[2]; const size_t width = blob->shape_[3]; - size_t itemCount = 0; + size_t itemCount = 0, nonZero = 0; for (size_t j = 0; j < channels; ++j) { AccReal sum = 0, var = 0; @@ -386,10 +355,16 @@ class BatchNormValidator : public test::op::Validator { sum += data; var += data * data; ++itemCount; + if (data != 0) { + ++nonZero; + } } } } + CHECK_GT(itemCount, 1U); // Not a valid check for one item + CHECK_NE(nonZero, 0); + const AccReal saveSum = sum, saveVar = var; // not channels @@ -401,16 +376,18 @@ class BatchNormValidator : public test::op::Validator { // expect zero mean EXPECT_NEAR(0, sum, kErrorBound); if (!Super::isNear(AccReal(0), sum, kErrorBound)) { - LOG(WARNING) << "Sum is not close enough to zero " + LOG(WARNING) << "Sum is not close enough to zero: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; + test::print(RunContext(), &(std::cerr << "Mean problem:" << std::endl), *blob); } // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1" + LOG(WARNING) << "Variance is not close enough to 1: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; + test::print(RunContext(), &(std::cerr << "Variance problem:" << std::endl), *blob); } } } @@ -473,96 +450,110 @@ class BatchNormValidator : public test::op::Validator { template static inline bool compare(const ExecutorType1& i1, const ExecutorType2& i2, - const typename ExecutorType1::WhichArray wa, const EnumType idx, bool print = false) { - const TBlob& b1 = i1.GetBlob(wa, idx); - const TBlob& b2 = i2.GetBlob(wa, idx); + test::CAccessAsCPU cpu1(i1.ctx().run_ctx, i1.GetBlob(idx), false), + cpu2(i2.ctx().run_ctx, i2.GetBlob(idx), false); + const TBlob& b1 = cpu1(); + const TBlob& b2 = cpu2(); if (print && test::debug_output) { - test::print(RunContext(), &(std::cout << "Blob 1:"), b1, true, true); - test::print(RunContext(), &(std::cout << "Blob 2:"), b2, true, true); + test::print(i1.ctx().run_ctx, &(std::cout << "Blob 1:"), b1, true, true); + test::print(i2.ctx().run_ctx, &(std::cout << "Blob 2:"), b2, true, true); + } + const bool rc = test::op::Validator::compare(b1, b2); + if (!rc) { + test::print(i1.ctx().run_ctx, &(std::cerr << "ERROR Blob 1:"), b1, true, true); + test::print(i2.ctx().run_ctx, &(std::cerr << "ERROR Blob 2:"), b2, true, true); } - return test::op::Validator::compare(b1, b2); + return rc; } /*! 
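Usage sketch for the enum-addressed compare overload above (illustration only; `info_cpu` and `info_gpu` are assumed OpInfo pairs for the same operator run on different devices):

  const bool ok = BatchNormValidator<float, float>::compare(
      *info_cpu.executor_, *info_gpu.executor_,
      ForwardOutputs::kForOutData, /*print=*/true);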
\brief Check batch norm output */ template - static void validateForward(const BNOperatorProp& data) { - //const TBlob& outputBlob = data.output_blobs()[mxnet::op::batchnorm::kData]; - const TBlob& outputBlob = data.GetBlob(BNOperatorProp::kForwardOut, - BNOperatorProp::kForOutData); - test::print(RunContext(), &(std::cout << "Fwd Output Blob:"), outputBlob, true, true); - switch (outputBlob.ndim()) { - case 3: - checkBatchNorm1D(&outputBlob); - break; - case 4: - checkBatchNorm2D(&outputBlob); - break; - case 5: - checkBatchNorm3D(&outputBlob); - break; - default: - CHECK(false) << "Supplied shape is not supported for this test"; - break; + static void validateForward(const RunContext& run_ctx, const BNOperatorProp& data) { + const TBlob &outputBlob = data.GetBlob(ForwardOutputs::kForOutData); + if (test::debug_output) { + test::print(run_ctx, &(std::cout << "Fwd Output Blob:"), outputBlob, true, true); } + test::AccessAsCPU(outputBlob, run_ctx, [](const TBlob& blob) { + switch (blob.ndim()) { + case 3: + checkBatchNorm1D(&blob); + break; + case 4: + checkBatchNorm2D(&blob); + break; + case 5: + checkBatchNorm3D(&blob); + break; + default: + CHECK(false) << "Supplied shape is not supported for this test"; + break; + } + }); } +#define TEST_ISTRUE(__args$) \ + do { \ + bool _rc; \ + EXPECT_TRUE((_rc = (__args$))); \ + if (!_rc) { \ + rc = false; \ + } \ + } while (0) + /*! \brief Compare entire operator data between two test sets */ template - static void compare( + static bool compare( const test::op::OpInfo>& info_1, const test::op::OpInfo>& info_2) { + bool rc = true; // Input - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - BNOperatorExecutor::kForwardIn, - BNOperatorExecutor::kForInData)); - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - BNOperatorExecutor::kForwardIn, - BNOperatorExecutor::kForGamma)); - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - BNOperatorExecutor::kForwardIn, - BNOperatorExecutor::kForBeta)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardInputs::kForInData)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardInputs::kForGamma)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardInputs::kForBeta)); // Output - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - BNOperatorExecutor::kForwardOut, - BNOperatorExecutor::kForOutData)); - CHECK_EQ(info_2.prop_->getParam().use_global_stats, - info_1.prop_->getParam().use_global_stats); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardOutputs::kForOutData)); + CHECK_EQ(info_2.prop_->getParam().use_global_stats, info_1.prop_->getParam().use_global_stats); -#if 0 #if MXNET_USE_CUDNN != 1 /* CUDNN takes a different approach here on first pass */ // Aux - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kAux, - mxnet::op::batchnorm::kMovingMean)); - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kAux, - mxnet::op::batchnorm::kMovingVar)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardOutputs::kForOutMean)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardOutputs::kForOutVar)); #endif + if (!info_2.prop_->getParam().use_global_stats) { - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kOutput, - mxnet::op::batchnorm::kMean)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardInputs::bwd_out_data_Mean)); + 
TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardInputs::bwd_out_data_Var)); // InGrad - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kInGrad, - mxnet::op::batchnorm::kData)); - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kInGrad, - mxnet::op::batchnorm::kGamma)); - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kInGrad, - mxnet::op::batchnorm::kBeta)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardOutputs::bwd_in_grad_Data)); +#if 0 + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardOutputs::bwd_in_grad_Gamma)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardOutputs::bwd_in_grad_Beta)); +#endif // OutGrad - EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, - test::op::CoreOpExecutor::kOutGrad, - mxnet::op::batchnorm::kData)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, + BackwardInputs::bwd_out_grad_Grad)); } -#endif + return rc; } }; +/** + * _____ _ + * | __ \ | | + * | |__) |__ _ _ __ __ _ _ __ ___ ___ | |_ ___ _ __ ___ + * | ___// _` | '__|/ _` | '_ ` _ \ / _ \| __|/ _ \| '__|/ __| + * | | | (_| | | | (_| | | | | | | __/| |_| __/| | \__ \ + * |_| \__,_|_| \__,_|_| |_| |_|\___| \__|\___||_| |___/ + * + * + */ static const test::op::kwargs_t blank_kwargs; static const test::op::kwargs_t blank_kwargs_nocudnn = { {"cudnn_off", "True"} }; @@ -591,39 +582,49 @@ static bool isUGS(const test::op::kwargs_t& kwargs) { } #endif // DISABLE_VALIDATION -template -static StreamType& PRT(StreamType *os, const OperatorExecutor& obj, - const typename OperatorExecutor::BlobVectorType bvt, const size_t idx) { - *os << OperatorExecutor::bvt2String(bvt) << ": " << idx - << ": "; - const TBlob& blob = obj.getBlobVect(bvt)[idx]; - - test::print(RunContext(), os, blob); +/** + * _____ _ ____ _ _ + * | __ \ | | / __ \ | | | | + * | | | | ___ | |__ _ _ __ _ | | | |_ _| |_ _ __ _ _| |_ + * | | | |/ _ \| '_ \| | | |/ _` | | | | | | | | __| '_ \| | | | __| + * | |__| | __/| |_) | |_| | (_| | | |__| | |_| | |_| |_) | |_| | |_ + * |_____/ \___||_.__/ \__,_|\__, | \____/ \__,_|\__| .__/ \__,_|\__| + * __/ | | | + * |___/ |_| + */ +template +static StreamType& _DBPRT(const RunContext& run_ctx, const char *label, + StreamType *os, const OperatorExecutor& obj, const BlobType type) { + *os << label << ": "; + test::print(RunContext(), os, test::CAccessAsCPU(run_ctx, obj.GetBlob(type), false)()); return *os; } +#define DBPRT(__os, __obj, __type$) _DBPRT(run_ctx, #__type$, __os, __obj, __type$) + template static StreamType& dumpF(StreamType *os, const test::op::OpInfo& prop, - const size_t x = 0) { - if (test::debug_output) { + const size_t x = 0, + const bool force = test::debug_output) { + if (force) { *os << std::endl; if (x) { *os << "=============================" << std::endl; *os << "= " << x << std::endl; *os << "=============================" << std::endl; } -// typedef typename OperatorExecutor::BlobVectorType BlobVectorType; -// PRT(os, *prop.executor_, BlobVectorType::kInput, mxnet::op::batchnorm::kData); -// PRT(os, *prop.executor_, BlobVectorType::kInput, mxnet::op::batchnorm::kGamma); -// PRT(os, *prop.executor_, BlobVectorType::kInput, mxnet::op::batchnorm::kBeta); -// -// PRT(os, *prop.executor_, BlobVectorType::kAux, mxnet::op::batchnorm::kMovingMean); -// PRT(os, *prop.executor_, BlobVectorType::kAux, mxnet::op::batchnorm::kMovingVar); -// -// PRT(os, *prop.executor_, 
BlobVectorType::kOutput, mxnet::op::batchnorm::kOut); -// PRT(os, *prop.executor_, BlobVectorType::kOutput, mxnet::op::batchnorm::kMean); -// PRT(os, *prop.executor_, BlobVectorType::kOutput, mxnet::op::batchnorm::kVar); + const RunContext run_ctx = prop.executor_->ctx().run_ctx; + DBPRT(os, *prop.executor_, ForwardInputs::kForInData); + DBPRT(os, *prop.executor_, ForwardInputs::kForGamma); + DBPRT(os, *prop.executor_, ForwardInputs::kForBeta); + + DBPRT(os, *prop.executor_, ForwardInputs::kForMovingMean); + DBPRT(os, *prop.executor_, ForwardInputs::kForMovingVar); + + DBPRT(os, *prop.executor_, ForwardOutputs::kForOutData); + DBPRT(os, *prop.executor_, ForwardOutputs::kForOutMean); + DBPRT(os, *prop.executor_, ForwardOutputs::kForOutVar); } return *os; } @@ -631,8 +632,9 @@ static StreamType& dumpF(StreamType *os, template static StreamType& dumpB(StreamType *os, const test::op::OpInfo& prop, - const size_t x = 0) { - if (test::debug_output) { + const size_t x = 0, + const bool force = test::debug_output) { + if (force) { *os << std::endl; if (x) { *os << "=============================" << std::endl; @@ -640,31 +642,29 @@ static StreamType& dumpB(StreamType *os, *os << "=============================" << std::endl; } -// typedef typename OperatorExecutor::BlobVectorType BlobVectorType; -// PRT(os, *prop.executor_, BlobVectorType::kInGrad, mxnet::op::batchnorm::kData); -// PRT(os, *prop.executor_, BlobVectorType::kInGrad, mxnet::op::batchnorm::kGamma); -// PRT(os, *prop.executor_, BlobVectorType::kInGrad, mxnet::op::batchnorm::kBeta); -// -// PRT(os, *prop.executor_, BlobVectorType::kAux, mxnet::op::batchnorm::kMovingMean); -// PRT(os, *prop.executor_, BlobVectorType::kAux, mxnet::op::batchnorm::kMovingVar); -// -// PRT(os, *prop.executor_, BlobVectorType::kOutGrad, mxnet::op::batchnorm::kOut); - } - return *os; -} + const RunContext run_ctx = prop.executor_->ctx().run_ctx; + DBPRT(os, *prop.executor_, BackwardOutputs::bwd_in_grad_Data); + DBPRT(os, *prop.executor_, BackwardOutputs::bwd_in_grad_Gamma); + DBPRT(os, *prop.executor_, BackwardOutputs::bwd_in_grad_Beta); -template -static StreamType& dumpF(StreamType *os, - const test::op::OpInfoPair& bi) { - return dumpF(&dumpF(os, bi.info_1_, 1), bi.info_2_, 2); -} + DBPRT(os, *prop.executor_, BackwardInputs::bwd_aux_states_MovingMean); + DBPRT(os, *prop.executor_, BackwardInputs::bwd_aux_states_MovingVar); -template -static StreamType& dumpB(StreamType *os, - const test::op::OpInfoPair& bi) { - return dumpB(&dumpB(os, bi.info_1_, 1), bi.info_2_, 2); + DBPRT(os, *prop.executor_, BackwardInputs::bwd_out_grad_Grad); + } + return *os; } +/** + * _______ _ ______ _ _ + * |__ __| | | | ____| | | (_) + * | | ___ ___ | |_ | |__ _ _ _ __ ___| |_ _ ___ _ __ ___ + * | |/ _ \/ __|| __| | __| | | | '_ \ / __| __| |/ _ \| '_ \ / __| + * | | __/\__ \| |_ | | | |_| | | | | (__| |_| | (_) | | | |\__ \ + * |_|\___||___/ \__| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_||___/ + * + * + */ /*! 
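A sketch of what the DBPRT macro above expands to at a call site (illustration only; note a RunContext named run_ctx must be in scope, which is why dumpF/dumpB bind it from the executor first):

  // DBPRT(os, *prop.executor_, ForwardInputs::kForInData) expands to roughly:
  _DBPRT(run_ctx, "ForwardInputs::kForInData", os, *prop.executor_,
         ForwardInputs::kForInData);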
 /*!
  * \brief Test batch norm operator forward pass
  */
 template<typename OperatorProp, typename OperatorExecutor>
 static test::op::OpInfo<OperatorProp, OperatorExecutor> TestBatchNormOperatorForward(
@@ -692,7 +692,8 @@ static test::op::OpInfo<OperatorProp, OperatorExecutor> TestBatchNormOperatorForward(
 #if !DISABLE_VALIDATION
   if (!isUGS(kwargs)) {
-    BatchNormValidator<typename OperatorExecutor::DataType,
-                       typename OperatorExecutor::AccRealType>::validateForward(*info.executor_);
+    BatchNormValidator<typename OperatorExecutor::DataType,
+                       typename OperatorExecutor::AccRealType>::validateForward(
+                         info.executor_->ctx().run_ctx, *info.executor_);
   }
 #endif

@@ -718,7 +719,6 @@ static test::op::OpInfoPair<OperatorProp1, OperatorProp2, OperatorExecutor> testForwardAndBackward(
                            const bool isGPU2,
                            const TShape &inputShape,
                            const test::op::kwargs_t& kwargs,
-                           const bool dumpC,
                            const size_t count = 1,
                            const size_t cycleCount = CYCLE_COUNT) {
   test::op::OpInfo<OperatorProp1, OperatorExecutor> info_1 =
@@ -748,14 +748,15 @@ static test::op::OpInfoPair<OperatorProp1, OperatorProp2, OperatorExecutor> testForwardAndBackward(
   }

   // Check that everything is the same after the forward pass
-  BatchNormValidator<DType, AccReal>::compare(info_1, info_2);
-
-  BatchNormValidator<DType, AccReal>::compare(
-    *info_1.executor_, *info_2.executor_,
-    OperatorExecutor::kForwardIn, OperatorExecutor::kForInData,
-    //test::op::CoreOpExecutor<DType, AccReal>::kInput,
-    //mxnet::op::batchnorm::kData,
-    false);
+  const bool b1 = BatchNormValidator<DType, AccReal>::compare(info_1, info_2);
+
+  const bool b2 = BatchNormValidator<DType, AccReal>::compare(*info_1.executor_,
+                                                              *info_2.executor_,
+                                                              kForInData, false);
+  if (!b1 || !b2) {
+    dumpF(&std::cout, info_1, 1, true);
+    dumpF(&std::cout, info_2, 2, true);
+  }

   if (!thisCount) {
     // return backward
@@ -772,13 +773,14 @@
     }

     // Check that everything is the same after the backward pass
-    BatchNormValidator<DType, AccReal>::compare(info_1, info_2);
+    if (!BatchNormValidator<DType, AccReal>::compare(info_1, info_2)) {
+      dumpF(&std::cout, info_1, 1, true);
+      dumpF(&std::cout, info_2, 2, true);
+      dumpB(&std::cout, info_1, 1, true);
+      dumpB(&std::cout, info_2, 2, true);
+    }
   } while (++thisCount < cycleCount);

-//  if (dumpC) {
-//    info_1.executor_->dumpC(&std::cerr, "BN_testForwardAndBackward");
-//  }
-
   return { info_1, info_2 };
 }

 template<typename OperatorProp, typename OperatorExecutor>
@@ -786,7 +788,6 @@ static test::op::OpInfoPair<OperatorProp, OperatorProp, OperatorExecutor>
 testForwardAndBackward(const bool isGPU,
                        const TShape &inputShape,
                        const test::op::kwargs_t kwargs,
-                       const bool dumpC = false,
                        const size_t count = 1,
                        const size_t cycleCount = CYCLE_COUNT
 ) {
@@ -795,14 +796,23 @@ testForwardAndBackward(const bool isGPU,
     isGPU,
     inputShape,
     kwargs,
-    dumpC,
     count,
     cycleCount);
 }

+/**
+ *   ____          _____
+ *  / __ \        |  __ \
+ * | |  | |_ __   | |__) |_ __ ___  _ __
+ * | |  | | '_ \  |  ___/| '__/ _ \| '_ \
+ * | |__| | |_) | | |    | | | (_) | |_) |
+ *  \____/| .__/  |_|    |_|  \___/| .__/
+ *        | |                      | |
+ *        |_|                      |_|
+ */
+
+// NOTE: This should know which version to use (V1, mkl, etc)
 struct BatchNormCoreOpProp : public mxnet::test::op::CoreOpProp {
-
   void Init(const mxnet::test::op::kwargs_t& kwargs) override {
     mxnet::test::op::CoreOpProp::Init(kwargs);
     params_.Init(kwargs, dmlc::parameter::kAllowUnknown);
@@ -817,77 +827,80 @@
 template<typename OperatorExecutor>
 static test::op::OpInfoPair<BatchNormCoreOpProp, BatchNormCoreOpProp, OperatorExecutor>
 testBNForwardAndBackward2D(const bool isGPU,
                            const TShape &inputShape,
-                           const test::op::kwargs_t& kwargs,
-                           const bool dumpC = false) {
+                           const test::op::kwargs_t& kwargs) {
   CHECK_EQ(inputShape.ndim(), 4);  // V1 can only handle 2D
-  return testForwardAndBackward<BatchNormCoreOpProp, BatchNormCoreOpProp, OperatorExecutor>(
-    isGPU,
-    isGPU,
-    inputShape,
-    kwargs,
-    dumpC);
+  return testForwardAndBackward<BatchNormCoreOpProp, BatchNormCoreOpProp, OperatorExecutor>(
+    isGPU, isGPU, inputShape, kwargs);
 }

-/*
- * Forward tests
- */
+template<typename OperatorExecutor>
+static test::op::OpInfoPair<BatchNormCoreOpProp, BatchNormCoreOpProp, OperatorExecutor>
+testBNForwardAndBackward(const bool isGPU,
+                         const TShape &inputShape,
+                         const test::op::kwargs_t& kwargs) {
+  return testForwardAndBackward<BatchNormCoreOpProp, BatchNormCoreOpProp, OperatorExecutor>(
+    isGPU, isGPU, inputShape, kwargs);
+}
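The rewritten testForwardAndBackward above moves from assert-as-you-go to "compare, then dump both executors only when a mismatch is found", so the expensive blob printing happens only on failure. A compact, stand-alone sketch of that pattern (FakeExec and the dump format are illustrative stand-ins, not this patch's types):

    #include <iostream>
    #include <vector>

    struct FakeExec { std::vector<float> out; };  // stand-in for an op executor

    static bool compare(const FakeExec &a, const FakeExec &b) {
      return a.out == b.out;  // element-wise equality is enough for the sketch
    }

    static void dump(std::ostream *os, const FakeExec &e, size_t tag) {
      *os << "= " << tag << " =" << std::endl;
      for (float v : e.out) *os << v << ' ';
      *os << std::endl;
    }

    int main() {
      FakeExec e1{{1.0f, 2.0f}}, e2{{1.0f, 2.5f}};
      if (!compare(e1, e2)) {  // only pay the printing cost on mismatch
        dump(&std::cout, e1, 1);
        dump(&std::cout, e2, 2);
      }
      return 0;
    }
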
+/**
+ *   _____             _ _
+ *  / ____|           (_) |
+ * | (___   __ _ _ __  _| |_ _   _
+ *  \___ \ / _` | '_ \| | __| | | |
+ *  ____) | (_| | | | | | |_| |_| |
+ * |_____/ \__,_|_| |_|_|\__|\__, |
+ *                            __/ |
+ *                           |___/
+ */
-TEST(BATCH_NORM, Test2DForwardV1V2) {
+TEST(BATCH_NORM, TestSanityForwardAndBackward) {
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32,
-    DType,
-    AccReal,
-    {
-      // Have to specify somehow v1 and v2
-      auto infoA = testBNForwardAndBackward2D<BNOperatorExecutor<DType, AccReal>>(
-        false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs);
-    });
+    DType, AccReal, {
+      testBNForwardAndBackward2D<BNOperatorExecutor<DType, AccReal>>(
+        false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs);
+    });
 }

-#if 0
-
-static const std::vector<mshadow::TypeFlag> v2_types = {mshadow::kFloat32,
-                                                        mshadow::kFloat64,
-                                                        mshadow::kFloat16};
+/**
+ *   _____                          _                           _            _
+ *  / ____|                        | |                         | |          | |
+ * | |     ___  _ __ _ __ ___  ___ | |_ _ __   ___  ___ ___    | |_ ___  ___| |_ ___
+ * | |    / _ \| '__| '__/ _ \/ __|| __| '_ \ / _ \/ __/ __|   | __/ _ \/ __|| __/ __|
+ * | |____| (_) | |  | | |  __/| (__| |_| | | |  __/\__ \\__ \  | ||  __/\__ \| |_\__ \
+ *  \_____\___/|_|  |_|  \___| \___|\__|_| |_|\___||___/|___/   \__\___||___/ \__|___/
+ *
+ */
+static const std::vector<mshadow::TypeFlag> v2_types = {
+  mshadow::kFloat32,
+  mshadow::kFloat64,
+  mshadow::kFloat16
+};

 TEST(BATCH_NORM, Test1DForward) {
-  for (int type : v2_types) {
-    MSHADOW_REAL_TYPE_SWITCH_EX(
-      type, DType, AccReal,
-      {
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs);
-      });
+  for (const mshadow::TypeFlag type : v2_types) {
+    MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, {
+      testBNForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+        false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs);
+    });
   }
 }

-TEST(BATCH_NORM, Test2DForwardV1) {
-  TestBatchNormOperatorForward<mxnet::op::BatchNormV1Prop, BNOperatorExecutor<float, float>>(
-    false,
-    {BATCH_SIZE, CHANNELS, DH, DW},
-    blank_kwargs);
-}
-
 TEST(BATCH_NORM, Test2DForward) {
   for (int type : v2_types) {
-    MSHADOW_REAL_TYPE_SWITCH_EX(
-      type, DType, AccReal,
-      {
-        auto opInfoFloatH = TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs);
-      });
+    MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, {
+      testBNForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+        false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs);
+    });
   }
 }

 TEST(BATCH_NORM, Test3DForward) {
-  for (int type : v2_types) {
-    MSHADOW_REAL_TYPE_SWITCH_EX(
-      type, DType, AccReal,
-      {
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs);
-      });
+  for (const mshadow::TypeFlag type : v2_types) {
+    MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, {
+      testBNForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+        false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs);
+    });
   }
 }
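These tests walk a runtime list of dtype flags and let MSHADOW_REAL_TYPE_SWITCH_EX instantiate the templated body once per type. A simplified sketch of that runtime-flag-to-template dispatch, with illustrative flag values (the real macro also covers float16 and more):

    #include <cstdio>

    enum TypeFlag { kFloat32 = 0, kFloat64 = 1 };  // illustrative flag values

    template <typename DType>
    static void run_case() {
      std::printf("sizeof(DType) = %zu\n", sizeof(DType));
    }

    // Stand-in for MSHADOW_REAL_TYPE_SWITCH_EX: map a runtime flag to a
    // compile-time template instantiation.
    static void dispatch(int flag) {
      switch (flag) {
        case kFloat32: run_case<float>(); break;
        case kFloat64: run_case<double>(); break;
        default: break;
      }
    }

    int main() {
      const int v2_types[] = {kFloat32, kFloat64};
      for (int t : v2_types) dispatch(t);  // mirrors the v2_types loops above
      return 0;
    }
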
@@ -976,16 +989,16 @@ TEST(BATCH_NORM, TestStochasticTiming_2D) {
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32, DType, AccReal,
     {
-      timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-        "RANDOM: BatchNormProp", false, true,
+      timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
+        "RANDOM: BatchNormCoreOpProp", false, true,
         blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS);
     });
 #if MXNET_USE_CUDA
   if (test::unitTestsWithCuda) {
     MSHADOW_REAL_TYPE_SWITCH_EX(
       mshadow::kFloat32, DType, AccReal,
       {
-        timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          "RANDOM: BatchNormProp", true, true,
+        timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
+          "RANDOM: BatchNormCoreOpProp", true, true,
           blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS);
       });
   }
 #endif

@@ -1004,28 +1017,32 @@ TEST(BATCH_NORM, TestTiming_2D) {
   }
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32, DType, AccReal, {
-#if defined(MXNET_USE_MKL2017) && (MXNET_USE_MKL2017 == 1)
-      timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+#if MXNET_USE_MKLDNN
+      // MKL
+      timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
         "MKL BatchNormProp 2D", false, false,
         blank_kwargs_nocudnn, 2, THISCOUNT);
 #endif
+      // CPU
       test::ScopeSet<volatile bool> disableMKL(&mxnet::op::batchnorm::disable_mkl, true);
-      timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+      timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
         "BatchNormProp 2D", false, false,
         blank_kwargs_nocudnn, 2, THISCOUNT);
 #if MXNET_USE_CUDA
       if (test::unitTestsWithCuda) {
-        timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+        // CUDA
+        timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
          "BatchNormProp 2D", true, false,
          blank_kwargs_nocudnn, 2, THISCOUNT);
 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
-        timingTest<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+        // CUDA-CUDNN
+        timingTest<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
          "CUDNN BatchNormProp 2D", true, false,
          blank_kwargs,
@@ -1037,16 +1054,29 @@
          2, THISCOUNT);
 #endif
       }
 #endif
     });
 }
 #endif  // _WIN32

-/**
- * Backward tests (generally include forward tests as well)
- */
+inline std::ostream& operator << (std::ostream& os, const test::op::kwargs_t& kwargs) {
+  if (!kwargs.empty()) {
+    os << "[";
+    size_t count = 0;
+    for (const auto &item : kwargs) {
+      if (count++) {
+        os << ", ";
+      }
+      os << item.first << "=" << item.second;
+    }
+    os << "]";
+  }
+  return os;
+}
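A quick usage sketch for the kwargs_t stream operator added above, assuming kwargs_t is the vector-of-string-pairs alias the tests use (the typedef below is a stand-in for test::op::kwargs_t):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<std::string, std::string>> kwargs_t;

    // Same shape as the operator added in the patch: [key=value, key=value].
    std::ostream& operator << (std::ostream& os, const kwargs_t& kwargs) {
      if (!kwargs.empty()) {
        os << "[";
        size_t count = 0;
        for (const auto &item : kwargs) {
          if (count++) os << ", ";
          os << item.first << "=" << item.second;
        }
        os << "]";
      }
      return os;
    }

    int main() {
      const kwargs_t kwargs = {{"fix_gamma", "True"}, {"cudnn_off", "False"}};
      std::cout << kwargs << std::endl;  // prints: [fix_gamma=True, cudnn_off=False]
      return 0;
    }
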
+#if 0
 TEST(BATCH_NORM, TestIterAll) {
   TShape shapes[] = {
     TShape({BATCH_SIZE, CHANNELS, DH}),
     TShape({BATCH_SIZE, CHANNELS, DH, DW}),
     TShape({BATCH_SIZE, CHANNELS, DEPTH, DH, DW})
   };
+  int pass = 0;
   const char *tof[2] = { "False", "True" };
   test::op::kwargs_t kwargs;
   for (size_t x1 = 0; x1 < 2U; ++x1) {
@@ -1058,19 +1088,25 @@ TEST(BATCH_NORM, TestIterAll) {
       kwargs.push_back({ "cudnn_off", "True" });
     }
     for (TShape shape : shapes) {
-      for (int g1 = 0; g1 < 2; ++g1) {
-        for (int g2 = 0; g2 < 2; ++g2) {
+      for (bool g1 : { false, true }) {
+        for (bool g2 : { false, true }) {
           for (int type : v2_types) {
+            std::cout << shape << ", " << op::type_string(type) << ", "
+                      << kwargs << ", g1 = "
+                      << g1 << ", g2 = " << g2 << std::endl;
+            std::cout << "." << std::flush;
             MSHADOW_REAL_TYPE_SWITCH_EX(
               type, DType, AccReal,
               {
-                test::op::OpInfoPair<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>
-                  bi = testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-                    g1 != 0, g2 != 0, shape, kwargs, false);  // Keep it simple
+                testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+                  g1, g2, shape, kwargs);  // Keep it simple
               });
+            std::cout << std::endl;
+            ++pass;
           }
         }
       }
@@ -1084,95 +1120,19 @@ TEST(BATCH_NORM, TestIterAll) {
      kwargs.pop_back();
    }
  }
-
-TEST(BATCH_NORM, Test2DBackward2DPlusLoadAndCompareLogic) {
-  test::ScopeSet<volatile bool> disableMKL(&mxnet::op::batchnorm::disable_mkl, true);
-  MSHADOW_REAL_TYPE_SWITCH_EX(
-    mshadow::kFloat32, DType, AccReal,
-    {
-      Test2DBackward2DPlusLoadAndCompareLogicUtil::test<DType, AccReal>();
-    });
-}
-
-template<typename PropType, typename OperatorExecutor>
-void compare(const bool isGPU,
-             const test::op::OpInfo<PropType, OperatorExecutor>& object,
-             const std::vector<
-               std::vector< std::vector<typename OperatorExecutor::DataType> > >& values) {
-  test::op::OpInfo<PropType, OperatorExecutor> info_checkLoad =
-    test::op::createOpAndInfoF<PropType, OperatorExecutor>(
-      blank_kwargs, isGPU, object.executor_->inputs()[0].shape_);
-  info_checkLoad.executor_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_);
-  info_checkLoad.executor_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_);
-  info_checkLoad.executor_->load(values);
-  BatchNormValidator<
-    typename OperatorExecutor::DataType,
-    typename OperatorExecutor::AccRealType>::compare(object, info_checkLoad);
-}
-
-
-#ifndef _WIN32
-TEST(BATCH_NORM, TestBackward1D_Simple) {
-  MSHADOW_REAL_TYPE_SWITCH_EX(
-    mshadow::kFloat32, DTypeX, AccReal,
-    {
-      const TShape inputShape({1, 1, 2});
-      test::op::OpInfo<mxnet::op::BatchNormProp, BNOperatorExecutor<DTypeX, AccReal>> info =
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DTypeX, AccReal>>(
-          false, inputShape, blank_kwargs);
-      info.executor_->initBackward(*info.prop_, &info.in_type_);
-      runOperatorBackward(&info);
-
-#if MXNET_DUMP_C
-      info.executor_->dumpC(&std::cerr, "BN_TestBackward1D_Simple");
-#endif
-
-      // Expected data state when running forward+backward starting with default values
-      // Note: This data structure generated by dumpC()
-      static const std::vector< std::vector< std::vector<float> > >
-        ___BN_TestBackward1D_Simple_data_shape_1_1_2___ = {
-          { /* kInput */
-            { 1.0f, 2.0f },
-            { 1.0f },
-            { 0.0f }
-          },
-          { /* kOutput */
-            { -0.998006f, 0.998006f },
-            { 1.5f },
-            { 0.25f }
-          },
-          { /* kAux */
-            { 0.15f },
-            { 0.925f }
-          },
-          { /* kInGrad */
-            { -0.00397621f, 0.00397609f },
-            { 0.0f },
-            { 2.998f }
-          },
-          { /* kOutGrad */
-            { 0.999f, 1.999f }
-          }
-        };
-      compare(false, info, ___BN_TestBackward1D_Simple_data_shape_1_1_2___);
-    });
-}
-#endif  // _WIN32
+#endif

 #ifndef _WIN32
 TEST(BATCH_NORM, TestBackward3D) {
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32, DType, AccReal,
     {
       const TShape inputShape({2, 3, 2, 3, 5});
-      test::op::OpInfo<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>> info =
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+      test::op::OpInfo<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>> info =
+        TestBatchNormOperatorForward<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
           false, inputShape, blank_kwargs);
       info.executor_->initBackward(*info.prop_, &info.in_type_);
       runOperatorBackward(&info);
-#if MXNET_DUMP_C
-      info.executor_->dumpC(&std::cerr, "TestBackward3D");
-#endif
     });
 }
 #endif  // _WIN32

@@ -1182,8 +1142,9 @@ class ChannelAxisTestData {
  protected:
   enum Mode { LOAD, SAVE };

-  void loadOrSave(const TBlob& blob, int channel_axis, const Mode mode) {
-    mxnet::op::batchnorm::BNTensor3<DType> tensor3(blob, channel_axis);
+  void loadOrSave(const RunContext& run_ctx, const TBlob& blob, int channel_axis, const Mode mode) {
+    test::CAccessAsCPU cpu_blob(run_ctx, blob, true);
+    mxnet::op::batchnorm::BNTensor3<DType> tensor3(cpu_blob(), channel_axis);
     const TShape &shape = blob.shape_;
     CHECK_GT(shape.ndim(), 0);
     if (channel_axis < 0) {
@@ -1233,14 +1194,15 @@ class ChannelAxisTestData {
     }
   }

-  static void print(const std::string& label, const TBlob& blob) {
+  static void print(const RunContext& run_ctx, const std::string& label, const TBlob& blob) {
     if (test::debug_output) {
       if (!label.empty()) {
         std::cout << label << ": ";
       }
+      test::CAccessAsCPU cpu_blob(run_ctx, blob, true);
       const size_t totalSize = blob.Size();
       for (size_t i = 0; i < totalSize; ++i) {
-        const float val = blob.dptr<DType>()[i];
+        const float val = cpu_blob().dptr<DType>()[i];
         if (i) {
           std::cout << ", ";
         }
@@ -1251,25 +1213,26 @@ class ChannelAxisTestData {
     }
   }

-  void save(const TBlob& blob, const int channel_axis) {
-    loadOrSave(blob, channel_axis, SAVE);
+  void save(const RunContext& run_ctx, const TBlob& blob, const int channel_axis) {
+    loadOrSave(run_ctx, blob, channel_axis, SAVE);
   }

-  void load(const TBlob& blob, const int channel_axis) {
-    loadOrSave(blob, channel_axis, LOAD);
+  void load(const RunContext& run_ctx, const TBlob& blob, const int channel_axis) {
+    loadOrSave(run_ctx, blob, channel_axis, LOAD);
   }
 };

 template<typename DType, typename AccReal>
-static void compare(const TBlob& blob, const std::vector<DType>& vals) {
+static void compare(const RunContext& run_ctx, const TBlob& blob, const std::vector<DType>& vals) {
   CHECK_EQ(blob.Size(), vals.size());
-  const DType *v = blob.dptr<DType>();
+  test::CAccessAsCPU cpu_blob(run_ctx, blob, false);
+  const DType *v = cpu_blob().dptr<DType>();
   for (size_t i = 0, n = vals.size(); i < n; ++i) {
     const DType vBlob = v[i];
     const DType vVect = vals[i];
     const bool near = BatchNormValidator<DType, AccReal>::isNear(
-      vBlob, vVect, BatchNormValidator<DType, AccReal>::ErrorBound(&blob));
-    EXPECT_TRUE(near);
+      vBlob, vVect, BatchNormValidator<DType, AccReal>::ErrorBound(&cpu_blob()));
     if (!near) {
       LOG(WARNING) << vBlob << " is not near enough to " << vVect << std::endl;
+      ASSERT_TRUE(near);
     }
   }
 }

@@ -1290,9 +1253,9 @@ static void compare(const std::vector<std::vector<DType>>& d1,
       const DType v2 = vec2[i];
       const bool near = BatchNormValidator<DType, AccReal>::isNear(
         v1, v2, BatchNormValidator<DType, AccReal>::ERROR_BOUND());
-      EXPECT_TRUE(near);
       if (!near) {
         LOG(WARNING) << v1 << " is not near enough to " << v2 << std::endl;
+        ASSERT_TRUE(near);
       }
     }
   }
 }
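The refactor threads a RunContext through these helpers so blobs can be inspected from the CPU no matter where they live. Judging only from the call sites above, test::CAccessAsCPU looks like a scoped copy-to-CPU guard whose operator() yields the CPU view. A minimal sketch of that RAII shape; the copy-back flag semantics are an assumption, and Blob is a stand-in for TBlob:

    #include <cstring>
    #include <iostream>
    #include <vector>

    struct Blob { float *ptr; size_t size; };  // illustrative stand-in for TBlob

    class AccessAsCPU {
     public:
      // 'copy_back' is assumed to mean: flush CPU-side writes to the source
      // blob when the guard leaves scope (true at write sites, false at read sites).
      AccessAsCPU(const Blob &src, bool copy_back)
          : src_(src), copy_back_(copy_back), local_(src.size) {
        std::memcpy(local_.data(), src.ptr, src.size * sizeof(float));
      }
      ~AccessAsCPU() {
        if (copy_back_)
          std::memcpy(src_.ptr, local_.data(), src_.size * sizeof(float));
      }
      // operator() yields the CPU-resident view, matching the cpu_blob() call sites.
      Blob operator()() { return Blob{local_.data(), local_.size()}; }

     private:
      Blob src_;
      bool copy_back_;
      std::vector<float> local_;
    };

    int main() {
      float storage[2] = {1.0f, 2.0f};
      Blob blob{storage, 2};
      {
        AccessAsCPU guard(blob, true);
        guard().ptr[0] = 5.0f;  // mutate the CPU view
      }                         // destructor copies the change back
      std::cout << storage[0] << std::endl;  // prints 5
      return 0;
    }
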
@@ -1311,13 +1274,17 @@ static void testSaveAndLoad(const std::vector<index_t>& dims,
     shape[i] = index_t(dims[i]);
   }

+  RunContext cpu_run_ctx;
+  cpu_run_ctx.ctx.dev_type = Context::kCPU;
+  cpu_run_ctx.ctx.dev_id = 0;
+  cpu_run_ctx.stream = nullptr;
   std::unique_ptr<test::StandaloneBlob> blob(new test::StandaloneBlob(
     shape, false, mshadow::DataType<DType>::kFlag));

-  data.save(*blob, channelAxis);
-  ChannelAxisTestData<DType>::print("saved to blob", *blob);
-  compare<DType, AccReal>(*blob, expectedBlobData);
-  data.load(*blob, channelAxis);
+  data.save(cpu_run_ctx, *blob, channelAxis);
+  ChannelAxisTestData<DType>::print(cpu_run_ctx, "saved to blob", *blob);
+  compare<DType, AccReal>(cpu_run_ctx, *blob, expectedBlobData);
+  data.load(cpu_run_ctx, *blob, channelAxis);
   compare<DType, AccReal>(data.channel_data_, inputChannelData);
 }

@@ -1369,7 +1336,6 @@ static TShape MakeShape(const std::vector<index_t>& shape,
   return newShape;
 }

-
 /*! \brief Create and arrange equivalent data with different channel axes, then compare
  *  normalized results */
 static void runChannelAxisTest(
@@ -1430,17 +1396,18 @@ static void runChannelAxisTest(
   // Create operator 1 with channelAxis1 (normally the experimental one)
   kwargs.push_back({"axis", std::to_string(channelAxis1)});
-  test::op::OpInfo<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>> info_c1 =
-    test::op::createOpAndInfoF<
-      mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-        kwargs, isGPU1, shape_c1);
+  test::op::OpInfo<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>> info_c1 =
+    test::op::createOpAndInfoF<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
+      BNOperatorExecutor<DType, AccReal>::ArgsWithOpName(
+        kwargs, "BatchNorm", "_backward_BatchNorm"), isGPU1, shape_c1, kwargs);
+  kwargs.pop_back();

   // Create operator 2 with channelAxis2 (normally the control one)
-  kwargs.pop_back();
   kwargs.push_back({"axis", std::to_string(channelAxis2)});
-  test::op::OpInfo<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>> info_c2 =
-    test::op::createOpAndInfoF<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-      kwargs, isGPU2, shape_c2);
+  test::op::OpInfo<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>> info_c2 =
+    test::op::createOpAndInfoF<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
+      BNOperatorExecutor<DType, AccReal>::ArgsWithOpName(
+        kwargs, "BatchNorm", "_backward_BatchNorm"), isGPU2, shape_c2, kwargs);
   kwargs.pop_back();

   // Init operators
@@ -1450,47 +1417,69 @@ static void runChannelAxisTest(
   info_c2.executor_->initBackward(*info_c2.prop_, &info_c2.in_type_);

   // Save input data to blob with new shape 1
-  data_c1.save(info_c1.executor_->inputs()[0], channelAxis1);
-  ChannelAxisTestData<DType>::print("blob 1 input", info_c1.executor_->inputs()[0]);
+  data_c1.save(info_c1.executor_->ctx().run_ctx,
+               info_c1.executor_->GetBlob(ForwardInputs::kForInData), channelAxis1);
+  ChannelAxisTestData<DType>::print(info_c1.executor_->ctx().run_ctx,
+                                    "blob 1 input",
+                                    info_c1.executor_->GetBlob(ForwardInputs::kForInData));

   // Save input data to blob with new shape 2
-  data_c2.save(info_c2.executor_->inputs()[0], channelAxis2);
-  ChannelAxisTestData<DType>::print("blob 2 input", info_c2.executor_->inputs()[0]);
+  data_c2.save(info_c2.executor_->ctx().run_ctx,
+               info_c2.executor_->GetBlob(ForwardInputs::kForInData), channelAxis2);
+  ChannelAxisTestData<DType>::print(info_c2.executor_->ctx().run_ctx,
+                                    "blob 2 input",
+                                    info_c2.executor_->GetBlob(ForwardInputs::kForInData));

   // Save output grad to blob with new shape 1
-  grad_c1.save(info_c1.executor_->bwd_inputs()[0], channelAxis1);
-  ChannelAxisTestData<DType>::print("blob 1 output grad", info_c1.executor_->bwd_inputs()[0]);
+  grad_c1.save(info_c1.executor_->ctx().run_ctx,
+               info_c1.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), channelAxis1);
+  ChannelAxisTestData<DType>::print(info_c1.executor_->ctx().run_ctx,
+                                    "blob 1 output grad",
+                                    info_c1.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad));

   // Save output grad to blob with new shape 2
-  grad_c2.save(info_c2.executor_->bwd_inputs()[0], channelAxis2);
-  ChannelAxisTestData<DType>::print("blob 2 output grad", info_c2.executor_->bwd_inputs()[0]);
+  grad_c2.save(info_c2.executor_->ctx().run_ctx,
+               info_c2.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), channelAxis2);
+  ChannelAxisTestData<DType>::print(info_c2.executor_->ctx().run_ctx,
+                                    "blob 2 output grad",
+                                    info_c2.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad));
   // Run both operators forward and backwards several times
   for (index_t x = 0; x < numberOfPasses; ++x) {
-    info_c1.executor_->forward();
-    info_c2.executor_->forward();
-
-    info_c1.executor_->backward();
-    info_c2.executor_->backward();
+    info_c1.executor_->forward(1);
+    info_c2.executor_->forward(1);
+    info_c1.executor_->backward(1);
+    info_c2.executor_->backward(1);
   }

+  //
+  // Check forward pass
+  //
   // Transform operator 1's blob output to a normalized shape
-  data_c1.load(info_c1.executor_->outputs()[0], channelAxis1);
+  data_c1.load(info_c1.executor_->ctx().run_ctx,
+               info_c1.executor_->GetBlob(ForwardOutputs::kForOutData), channelAxis1);
   ChannelAxisTestData<DType>::print("channel data 1", data_c1.channel_data_);

   // Transform operator 2's blob output to a normalized shape
-  data_c2.load(info_c2.executor_->outputs()[0], channelAxis2);
+  data_c2.load(info_c2.executor_->ctx().run_ctx,
+               info_c2.executor_->GetBlob(ForwardOutputs::kForOutData), channelAxis2);
   ChannelAxisTestData<DType>::print("channel data 2", data_c2.channel_data_);

   // Compare the operators' output data while they're in a normalized shape
   compare<DType, AccReal>(data_c1.channel_data_, data_c2.channel_data_);

+  //
+  // Check backward pass
+  //
   // Transform operator 1's input-grad blob to a normalized shape
-  grad_c1.load(info_c1.executor_->bwd_outputs()[0], channelAxis1);
+  grad_c1.load(info_c1.executor_->ctx().run_ctx,
+               info_c1.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), channelAxis1);
   ChannelAxisTestData<DType>::print("input grad 1", grad_c1.channel_data_);

   // Transform operator 2's input-grad blob to a normalized shape
-  grad_c2.load(info_c2.executor_->bwd_outputs()[0], channelAxis2);
+  grad_c2.load(info_c2.executor_->ctx().run_ctx,
+               info_c2.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), channelAxis2);
   ChannelAxisTestData<DType>::print("input grad 2", grad_c2.channel_data_);

   // Compare the operators' input grad data while they're in a normalized shape
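runChannelAxisTest's equivalence check works because any placement of the channel axis can be collapsed to the same (outer, channel, inner) coordinates, which is what BNTensor3 does for the validator. A small illustrative sketch of that collapsing (the shape and axis values are made up for the example):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Collapse dims before 'axis' into 'outer' and dims after it into 'inner'.
    static void to_outer_channel_inner(const std::vector<size_t> &shape, size_t axis,
                                       size_t *outer, size_t *channel, size_t *inner) {
      *outer = 1;
      *inner = 1;
      for (size_t i = 0; i < axis; ++i) *outer *= shape[i];
      *channel = shape[axis];
      for (size_t i = axis + 1; i < shape.size(); ++i) *inner *= shape[i];
    }

    int main() {
      size_t outer, channel, inner;
      // The 5D test shape used above, with the default channel axis 1:
      to_outer_channel_inner({2, 3, 2, 3, 5}, 1, &outer, &channel, &inner);
      std::cout << outer << " x " << channel << " x " << inner << std::endl;  // 2 x 3 x 30
      return 0;
    }
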
@@ -1521,6 +1510,7 @@ TEST(BATCH_NORM, TestChannelAxisSimple) {
  * Channel position 1 (default) is checked everywhere else, so forward and
  * backward result equivalence here implies correctness for other channel positions
  */
+#if 0
 TEST(BATCH_NORM, TestChannelAxis) {
   test::ScopeSet<bool> noDebugOutput(&test::debug_output, false);

@@ -1532,14 +1522,16 @@ TEST(BATCH_NORM, TestChannelAxis) {
     {1, 2, 3, 4}};

   const char *tof[2] = {"False", "True"};
+  size_t pass = 0;

   for (size_t x1 = 0; x1 < 2U; ++x1) {
     kwargs.push_back({"fix_gamma", tof[x1]});
     for (size_t x2 = 0; x2 < 2U; ++x2) {
       kwargs.push_back({"use_global_stats", tof[x2]});
       for (size_t x3 = 0; x3 < 2U; ++x3) {
         kwargs.push_back({"cudnn_off", tof[x3]});
-        for (index_t g1 = 0; g1 < 2U; ++g1) {
-          for (index_t g2 = 0; g2 < 2U; ++g2) {
+        for (bool g1 : { false, true }) {
+          for (bool g2 : { false, true }) {
             for (const std::vector<index_t> &simpleShape : shapes) {
               const int dim = static_cast<int>(simpleShape.size());
               for (signed int channelAxis = -dim, shapeDim = dim;
@@ -1547,8 +1539,9 @@
                    ++channelAxis) {
                 for (size_t channelCount = 1; channelCount <= 3; ++channelCount) {
                   // Check against base-case of channel axis position 1
-                  runChannelAxisTest(g1 != 0, g2 != 0, kwargs, simpleShape,
+                  runChannelAxisTest(g1, g2, kwargs, simpleShape,
                                      1, channelAxis, channelCount, false);
+                  ++pass;
                 }
               }
             }
           }
         }
+#endif

@@ -1570,11 +1563,11 @@ TEST(BATCH_NORM, Test2DForward2D_gpu) {
     MSHADOW_REAL_TYPE_SWITCH_EX(
       type, DType, AccReal,
       {
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+        TestBatchNormOperatorForward<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
           true,
           {BATCH_SIZE, CHANNELS, DH, DW},
           blank_kwargs);
-        TestBatchNormOperatorForward<mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
+        TestBatchNormOperatorForward<BatchNormCoreOpProp, BNOperatorExecutor<DType, AccReal>>(
           true,
           {BATCH_SIZE, CHANNELS, DH, DW},
           blank_kwargs_nocudnn);
@@ -1588,12 +1581,12 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu) {
       type, DType, AccReal,
       {
         const TShape inputShape({1, 1, 2, 1});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, blank_kwargs, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, blank_kwargs_nocudnn, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, blank_kwargs);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, blank_kwargs_nocudnn);
       });
   }
 }
@@ -1604,12 +1597,12 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu) {
       type, DType, AccReal,
       {
         const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, blank_kwargs, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, blank_kwargs_nocudnn, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, blank_kwargs);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, blank_kwargs_nocudnn);
       });
   }
 }
@@ -1622,12 +1615,12 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_nfg) {
       type, DType, AccReal,
       {
         const TShape inputShape({1, 1, 2, 1});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, nonfixgamma_kwargs, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, nonfixgamma_kwargs_nocudnn, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, nonfixgamma_kwargs);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, nonfixgamma_kwargs_nocudnn);
       });
   }
 }
@@ -1638,12 +1631,12 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_nfg) {
       type, DType, AccReal,
      {
         const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, nonfixgamma_kwargs, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, nonfixgamma_kwargs_nocudnn, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, nonfixgamma_kwargs);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, nonfixgamma_kwargs_nocudnn);
       });
   }
 }
@@ -1656,12 +1649,12 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_ugs) {
       type, DType, AccReal,
       {
         const TShape inputShape({2, 3, 2, 2});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, useglobalstats_kwargs_nocudnn, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, useglobalstats_kwargs, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, useglobalstats_kwargs_nocudnn);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, useglobalstats_kwargs);
       });
   }
 }
@@ -1672,12 +1665,12 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) {
       type, DType, AccReal,
       {
         const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW});
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, useglobalstats_kwargs, false);
-        testForwardAndBackward<mxnet::op::BatchNormProp, mxnet::op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
-          false, true, inputShape, useglobalstats_kwargs_nocudnn, false);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, useglobalstats_kwargs);
+        testForwardAndBackward<BNOperatorExecutor<DType, AccReal>>(
+          false, true, inputShape, useglobalstats_kwargs_nocudnn);
       });
   }
 }
diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc
index 4132fcb22c62..4afd56fe586a 100644
--- a/tests/cpp/operator/dropout_perf.cc
+++ b/tests/cpp/operator/dropout_perf.cc
@@ -45,7 +45,6 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) {
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
                                                            "_backward_Dropout");
   runner.set_verbose(true);
-  //runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
   runner.RunBidirectional(false, { shape }, kwargs, 1);
 }
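The perf tests that follow share a warm-up-then-measure shape: one untimed bidirectional pass first (so lazy initialization and cache effects do not pollute the numbers), then timed passes over a list of shapes. A generic sketch of that pattern with a stand-in workload (nothing below is from the runner API itself):

    #include <chrono>
    #include <cstdio>
    #include <vector>

    static void run_op(size_t n) {  // stand-in for a forward+backward pass
      volatile double acc = 0;
      for (size_t i = 0; i < n; ++i) acc += i * 0.5;
    }

    int main() {
      run_op(1 << 20);  // warm-up: excluded from measurements
      const std::vector<size_t> shapes = {1 << 16, 1 << 18, 1 << 20};
      for (size_t n : shapes) {
        const auto start = std::chrono::steady_clock::now();
        run_op(n);
        const std::chrono::duration<double, std::milli> ms =
            std::chrono::steady_clock::now() - start;
        std::printf("n=%zu: %.3f ms\n", n, ms.count());
      }
      return 0;
    }
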
@@ -60,7 +59,7 @@ TEST(DROPOUT_PERF, TimingCPU) {
   test::op::CoreOperatorRunner<float> runner;
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
                                                            "_backward_Dropout");
-  runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
+  runner.RunBidirectional(false, { shape }, kwargs, 1);
   std::vector<TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -95,7 +94,7 @@ TEST(DROPOUT_PERF, TimingGPU) {
   test::op::CoreOperatorRunner<float> runner;
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
                                                            "_backward_Dropout");
-  runner.RunGenericOperatorForward(true, { shape }, kwargs, 1);
+  runner.RunBidirectional(true, { shape }, kwargs, 1);
   std::vector<TShape> shapes = {
     {1, 1, 28, 28},
     {1, 3, 28, 28},
diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc
index 2283562dea2b..e574ae2b4379 100644
--- a/tests/cpp/operator/fully_conn_perf.cc
+++ b/tests/cpp/operator/fully_conn_perf.cc
@@ -47,7 +47,7 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) {
   runner.set_verbose(true);
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
                                                            "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
+  runner.RunBidirectional(false, { shape1, shape2 }, kwargs, 1);
 }

 /*!
@@ -60,7 +60,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
   test::op::CoreOperatorRunner<float> runner;
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
                                                            "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
+  runner.RunBidirectional(false, { shape1, shape2 }, kwargs, 1);
   std::vector<TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -96,7 +96,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) {
   test::op::CoreOperatorRunner<float> runner;
   kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
                                                            "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(true, { shape1, shape2 }, kwargs, 1);
+  runner.RunBidirectional(true, { shape1, shape2 }, kwargs, 1);
   std::vector<TShape> shapes;
   if (test::performance_run) {
     shapes = {
diff --git a/tests/cpp/operator/tune/operator_tune_test.cc b/tests/cpp/operator/tune/operator_tune_test.cc
index f404e4faa923..7d84e47dbf16 100644
--- a/tests/cpp/operator/tune/operator_tune_test.cc
+++ b/tests/cpp/operator/tune/operator_tune_test.cc
@@ -18,13 +18,14 @@
  */
 #include
 #include
-#include
 #include "../../src/operator/nn/activation-inl.h"
 #include "../../src/operator/operator_tune-inl.h"
 #include "../include/test_op_runner.h"
 #include "../include/test_core_op.h"
 #include "../include/test_tune.h"

+#if MXNET_USE_OPERATOR_TUNING
+
 using namespace mxnet;

 /*!
@@ -173,3 +174,4 @@ TEST(OMP_TUNING, EvaluateTuneTestInt64) {
   std::cout << "Success rate for type " << test::type_name<DType>() << ": "
             << result << std::endl;
 }
+#endif  // MXNET_USE_OPERATOR_TUNING
\ No newline at end of file
diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc
index 5556d7bf1c2e..fc46dff1d9b9 100644
--- a/tests/cpp/test_main.cc
+++ b/tests/cpp/test_main.cc
@@ -64,7 +64,7 @@ static bool checkForWorkingCuda() {
       }
     }
   }
-  std::fprintf(stderr, "Warning: Could not find working CUDA driver\n");
+  std::cerr << "Warning: Could not find working CUDA driver" << std::endl;
   return false;
 }
 #else
@@ -89,19 +89,20 @@ int main(int argc, char ** argv) {
   mxnet::test::unitTestsWithCuda = checkForWorkingCuda();  // auto-determine

   for (int x = 1; x < argc; ++x) {
+    const char *arg = argv[x];
     // force checks with CUDA
-    if (!strcmp(argv[x], "--with-cuda")) {
+    if (!strcmp(arg, "--with-cuda")) {
       // override (ie force attempt CUDA)
       mxnet::test::unitTestsWithCuda = true;
-    } else if (!strcmp(argv[x], "--debug")) {
+    } else if (!strcmp(arg, "--debug") || !strcmp(arg, "-d")) {
       mxnet::test::debug_output = true;
-    } else if (!strcmp(argv[x], "--perf")) {
+    } else if (!strcmp(arg, "--perf") || !strcmp(arg, "-p")) {
       mxnet::test::performance_run = true;
-    } else if (!strcmp(argv[x], "--csv")) {
+    } else if (!strcmp(arg, "--csv")) {
       mxnet::test::csv = true;
-    } else if (!strcmp(argv[x], "--quick") || !strcmp(argv[x], "-q")) {
+    } else if (!strcmp(arg, "--quick") || !strcmp(arg, "-q")) {
       mxnet::test::quick_test = true;
-    } else if (!strcmp(argv[x], "--backtrace")) {
+    } else if (!strcmp(arg, "--backtrace")) {
       backtrace_test();
       return 0;
     }