diff --git a/README.md b/README.md index 349efc3065d931..6894ab4f79ef88 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ ----------------- -| **`Linux CPU`** | **`Linux GPU PIP`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | +| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | |-----------------|---------------------|------------------|-------------------|---------------| -| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-gpu_pip)](https://ci.tensorflow.org/job/tensorflow-master-gpu_pip) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | +| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | **TensorFlow** is an open source software library for numerical computation using data flow graphs. Nodes in the graph represent mathematical operations, while diff --git a/RELEASE.md b/RELEASE.md index ead29f0c547f6d..12dde27c20cd4d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -85,7 +85,7 @@ removed. * `tf.all_variables`, `tf.VARIABLES` and `tf.initialize_all_variables` renamed to `tf.global_variables`, `tf.GLOBAL_VARIABLES` and - `tf.global_variable_initializers` respectively. + `tf.global_variables_initializer` respectively. ## Bug Fixes and Other Changes diff --git a/configure b/configure index 8a3bb677682107..156c25fdd2054f 100755 --- a/configure +++ b/configure @@ -279,7 +279,7 @@ while true; do TF_CUDNN_VERSION=${BASH_REMATCH[1]} echo "libcudnn.so resolves to libcudnn${TF_CUDNN_EXT}" elif [[ "$REALVAL" =~ ([0-9]*).dylib ]]; then - TF_CUDNN_EXT="."${BASH_REMATCH[1]}".dylib" + TF_CUDNN_EXT=${BASH_REMATCH[1]}".dylib" TF_CUDNN_VERSION=${BASH_REMATCH[1]} echo "libcudnn.dylib resolves to libcudnn${TF_CUDNN_EXT}" fi diff --git a/libxsmm.BUILD b/libxsmm.BUILD new file mode 100644 index 00000000000000..a03066590ba4e4 --- /dev/null +++ b/libxsmm.BUILD @@ -0,0 +1,115 @@ +# Description: +# LIBXSMM: Library for small matrix-matrix multiplications targeting Intel Architecture (x86). + +licenses(["notice"]) # BSD 3-clause +exports_files(["LICENSE"]) + +# Arguments to ./scripts/libxsmm_interface.py, see that file for detailed description. 
+# precision: SP & DP +# ilp64: no +# prefetch: 1 (auto) +libxsmm_interface_arguments = "0 0 1" + +# Arguments to ./scripts/libxsmm_config.py, see that file for detailed description. +# ilp64: no +# offload: no +# alignment [b] +# prefetch: 1 (auto) +# threshold: fallback to BLAS if n*m*k above this +# synchronize: yes +# jit: yes +# flags +# alpha = 1 +# beta = 1 +libxsmm_config_arguments = "0 0 64 1 0 1 1 0 1 1" + +genrule( + name = "libxsmm_headers", + srcs = [ + "src/template/libxsmm.h", + "src/template/libxsmm_config.h", + ], + outs = [ + "include/libxsmm.h", + "include/libxsmm_config.h", + ], + cmd = "$(location :libxsmm_interface) $(location src/template/libxsmm.h) " + libxsmm_interface_arguments + " > $(location include/libxsmm.h);" + + "$(location :libxsmm_config) $(location src/template/libxsmm_config.h) " + libxsmm_config_arguments + " > $(location include/libxsmm_config.h)", + tools = [ + ":libxsmm_config", + ":libxsmm_interface", + ], +) + +cc_library( + name = "xsmm_avx", + srcs = [ + "src/libxsmm_main.c", + "src/libxsmm_dump.c", + "src/libxsmm_malloc.c", + "src/libxsmm_gemm.c", + "src/libxsmm_timer.c", + "src/libxsmm_trace.c", + "src/libxsmm_trans.c", + "src/libxsmm_sync.c", + "src/libxsmm_perf.c", + "src/libxsmm_dnn.c", + "src/libxsmm_dnn_convolution_forward.c", + "src/libxsmm_cpuid_x86.c", + ] + glob([ + "src/generator_*.c", + ]), + hdrs = [ + "include/libxsmm_dnn.h", + "include/libxsmm_frontend.h", + "include/libxsmm_generator.h", + "include/libxsmm_macros.h", + "include/libxsmm_malloc.h", + "include/libxsmm_sync.h", + "include/libxsmm_timer.h", + "include/libxsmm_typedefs.h", + "include/libxsmm_dispatch.h", + "src/libxsmm_gemm_diff.c", + "src/libxsmm_cpuid_x86.c", + "src/libxsmm_hash.c", + # Generated: + "include/libxsmm.h", + "include/libxsmm_config.h", + ] + glob([ + "src/*.h", + "src/template/*.c", + ]), + copts = [ + "-mavx", # JIT does not work without avx anyway, and this silences some CRC32 warnings. + "-Wno-vla", # Libxsmm convolutions heavily use VLA. 
+ ], + defines = [ + "LIBXSMM_BUILD", + "LIBXSMM_CPUID_X86_NOINLINE", + "__BLAS=0", + ], + includes = ["include"], + linkopts = ["-ldl"], + visibility = ["//visibility:public"], + deps = [ + ":libxsmm_headers", + ], +) + +py_library( + name = "libxsmm_scripts", + srcs = glob(["scripts/*.py"]), + data = ["version.txt"], +) + +py_binary( + name = "libxsmm_interface", + srcs = ["scripts/libxsmm_interface.py"], + deps = [":libxsmm_scripts"], +) + +py_binary( + name = "libxsmm_config", + srcs = ["scripts/libxsmm_config.py"], + deps = [":libxsmm_scripts"], +) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 2d7c28feb7fa4e..e75dc1aa450d9c 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -94,6 +94,7 @@ filegroup( "//tensorflow/contrib:all_files", "//tensorflow/contrib/android:all_files", "//tensorflow/contrib/bayesflow:all_files", + "//tensorflow/contrib/compiler:all_files", "//tensorflow/contrib/copy_graph:all_files", "//tensorflow/contrib/crf:all_files", "//tensorflow/contrib/cudnn_rnn:all_files", @@ -105,6 +106,8 @@ filegroup( "//tensorflow/contrib/framework:all_files", "//tensorflow/contrib/graph_editor:all_files", "//tensorflow/contrib/grid_rnn:all_files", + "//tensorflow/contrib/input_pipeline:all_files", + "//tensorflow/contrib/input_pipeline/kernels:all_files", "//tensorflow/contrib/integrate:all_files", "//tensorflow/contrib/labeled_tensor:all_files", "//tensorflow/contrib/layers:all_files", @@ -116,7 +119,6 @@ filegroup( "//tensorflow/contrib/lookup:all_files", "//tensorflow/contrib/losses:all_files", "//tensorflow/contrib/metrics:all_files", - "//tensorflow/contrib/metrics/kernels:all_files", "//tensorflow/contrib/ndlstm:all_files", "//tensorflow/contrib/opt:all_files", "//tensorflow/contrib/rnn:all_files", diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 11c6207599d975..fad437abbbac11 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -117,6 +117,19 @@ Status LogGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Log", LogGrad); +Status Log1pGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // f(x) = log1p(x) = y + // df/dx = 1 / (1 + x) + // dx = dy * (1 / (1 + x)) + auto one = Cast(scope, Const(scope, 1.0), op.input(0).type()); + grad_outputs->push_back( + Div(scope, grad_inputs[0], Add(scope, one, op.input(0)))); + return scope.status(); +} +REGISTER_GRADIENT_OP("Log1p", Log1pGrad); + Status TanhGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 8b7fb8d765e986..7c0a14e20efd30 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -43,6 +43,7 @@ class CWiseUnaryGradTest : public ::testing::Test { RSQRT, EXP, LOG, + LOG1P, TANH, SIGMOID, SIGN, @@ -101,6 +102,9 @@ class CWiseUnaryGradTest : public ::testing::Test { case LOG: y = Log(scope_, x); break; + case LOG1P: + y = Log1p(scope_, x); + break; case TANH: y = Tanh(scope_, x); break; @@ -207,6 +211,15 @@ TEST_F(CWiseUnaryGradTest, Log) { TestCWiseGrad(LOG, x_fn, dy_fn, dx_fn); } +TEST_F(CWiseUnaryGradTest, Log1p) { + auto x_fn = [this](const int i) { return RV({0, 1e-6, 1, 2, 3, 4, 100}); }; + auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); }; + auto dx_fn = [this](const float x, const float dy) { + return dy * (1.0 / 
(1.0 + x)); + }; + TestCWiseGrad(LOG1P, x_fn, dy_fn, dx_fn); +} + TEST_F(CWiseUnaryGradTest, Tanh) { auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); }; auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); }; diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 73212f0dc42d13..2acf9bf777a69a 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -31,11 +31,13 @@ namespace tensorflow { namespace { auto* load_attempt_count = monitoring::Counter<2>::New( - "/tensorflow/cc/saved_model/load_attempt_count", "model_path", "status", - "The number of times a SavedModel was successfully loaded."); + "/tensorflow/cc/saved_model/load_attempt_count", + "The number of times a SavedModel was successfully loaded.", "model_path", + "status"); auto* load_latency = monitoring::Counter<1>::New( - "/tensorflow/cc/saved_model/load_latency", "model_path", - "Latency in microseconds for SavedModels that were successfully loaded."); + "/tensorflow/cc/saved_model/load_latency", + "Latency in microseconds for SavedModels that were succesfully loaded.", + "model_path"); constexpr char kLoadAttemptFail[] = "fail"; constexpr char kLoadAttemptSuccess[] = "success"; diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index dbbcc79802a0ef..fefd7ab6e36a29 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -190,7 +190,7 @@ TEST_F(LoaderTest, MaybeSavedModelDirectory) { // Directory that exists but is an invalid SavedModel location. const string invalid_export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), "cc/saved_model/testdata"); + io::JoinPath(testing::TensorFlowSrcRoot(), "cc/saved_model"); EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir)); } diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 5d6710ea5ccf7d..e703a9bb307b50 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -166,7 +166,9 @@ void QueueRunner::Run(Session* sess, const string& enqueue_op) { last_run = (runs_ == 0); } - if (IsQueueClosed(status)) { + // Close the queue unless the coordinator is shutting down since the cancel op + // will be run anway in this case. 
+ if (IsQueueClosed(status) && (!coord_ || !coord_->ShouldStop())) { if (last_run && !close_op_name_.empty()) { UpdateStatus(sess->Run({}, {}, {close_op_name_}, nullptr)); } diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 9a6b116c61927c..a865035e93a25c 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -14,6 +14,7 @@ py_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/contrib/bayesflow:bayesflow_py", + "//tensorflow/contrib/compiler:compiler_py", "//tensorflow/contrib/copy_graph:copy_graph_py", "//tensorflow/contrib/crf:crf_py", "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py", @@ -24,10 +25,12 @@ py_library( "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/contrib/grid_rnn:grid_rnn_py", + "//tensorflow/contrib/input_pipeline:input_pipeline_py", "//tensorflow/contrib/integrate:integrate_py", "//tensorflow/contrib/labeled_tensor", "//tensorflow/contrib/layers:layers_py", "//tensorflow/contrib/learn", + "//tensorflow/contrib/legacy_seq2seq:seq2seq_py", "//tensorflow/contrib/linalg:linalg_py", "//tensorflow/contrib/linear_optimizer:sdca_ops_py", "//tensorflow/contrib/lookup:lookup_py", @@ -58,9 +61,9 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/contrib/factorization/kernels:all_kernels", + "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels", "//tensorflow/contrib/layers:bucketization_op_kernel", "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel", - "//tensorflow/contrib/metrics:set_ops_kernels", ], ) @@ -70,9 +73,9 @@ cc_library( deps = [ "//tensorflow/contrib/factorization:all_ops", "//tensorflow/contrib/framework:all_ops", + "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib", "//tensorflow/contrib/layers:bucketization_op_op_lib", "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib", - "//tensorflow/contrib/metrics:set_ops_op_lib", ], ) diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 31c3064cdefdb5..1161f3c44f6dde 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -20,6 +20,7 @@ # Add projects here, they will show up under tf.contrib. 
from tensorflow.contrib import bayesflow +from tensorflow.contrib import compiler from tensorflow.contrib import copy_graph from tensorflow.contrib import crf from tensorflow.contrib import cudnn_rnn @@ -29,10 +30,12 @@ from tensorflow.contrib import framework from tensorflow.contrib import graph_editor from tensorflow.contrib import grid_rnn +from tensorflow.contrib import input_pipeline from tensorflow.contrib import integrate from tensorflow.contrib import labeled_tensor from tensorflow.contrib import layers from tensorflow.contrib import learn +from tensorflow.contrib import legacy_seq2seq from tensorflow.contrib import linalg from tensorflow.contrib import linear_optimizer from tensorflow.contrib import lookup diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py index 1646abcd9fb0ba..74bf699d222ad5 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py @@ -42,8 +42,8 @@ def testStochasticVariables(self): self.assertEqual( {"stochastic_variables/sv_mu", "stochastic_variables/sv_sigma"}, - set([v.op.name for v in tf.all_variables()])) - self.assertEqual(set(tf.trainable_variables()), set(tf.all_variables())) + set([v.op.name for v in tf.global_variables()])) + self.assertEqual(set(tf.trainable_variables()), set(tf.global_variables())) v = tf.convert_to_tensor(v) self.assertEqual(list(shape), v.get_shape().as_list()) @@ -64,7 +64,7 @@ def testStochasticVariablesWithConstantInitializer(self): })): v = tf.get_variable("sv") - for var in tf.all_variables(): + for var in tf.global_variables(): if "mu" in var.name: mu_var = var if "sigma" in var.name: @@ -96,7 +96,7 @@ def sigma_init(shape, dtype, partition_info): })): v = tf.get_variable("sv", shape) - for var in tf.all_variables(): + for var in tf.global_variables(): if "mu" in var.name: mu_var = var if "sigma" in var.name: diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py index 64488ebb1006d9..f9f3721047b6e1 100644 --- a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py +++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py @@ -183,7 +183,7 @@ def mean_baseline(_, loss): with vs.variable_scope(name, default_name="MeanBaseline"): reduced_loss = math_ops.reduce_mean(loss) - ema = training.ExponentialMovingAverage(decay=ema_decay) + ema = training.ExponentialMovingAverage(decay=ema_decay, zero_debias=True) update_op = ema.apply([reduced_loss]) with ops.control_dependencies([update_op]): diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index b682838b30bd61..ec6be97151737f 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -57,6 +57,7 @@ if(WIN32) add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\") add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS) add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0) + add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel. add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-) # Suppress warnings to reduce build log size. 
add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018) @@ -149,11 +150,11 @@ if (tensorflow_ENABLE_GPU) # by default we assume compute cabability 3.5 and 5.2. If you change this change it in # CUDA_NVCC_FLAGS and cuda_config.h below - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\") set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr) set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include) include_directories(${CUDA_INCLUDE}) - add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2) + add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2) # add cudnn include_directories(${CUDNN_HOME}) @@ -163,7 +164,7 @@ if (tensorflow_ENABLE_GPU) FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h "#ifndef CUDA_CUDA_CONFIG_H_\n" "#define CUDA_CUDA_CONFIG_H_\n" - "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n" + "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n" "#define TF_CUDA_VERSION \"64_80\"\n" "#define TF_CUDNN_VERSION \"64_5\"\n" "#endif // CUDA_CUDA_CONFIG_H_\n" diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake index 231159ed0a5ddb..da20561b8806ca 100644 --- a/tensorflow/contrib/cmake/external/gif.cmake +++ b/tensorflow/contrib/cmake/external/gif.cmake @@ -1,7 +1,7 @@ include (ExternalProject) set(gif_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/gif_archive/giflib-5.1.4/) -set(gif_URL http://cdimage.debian.org/mirror/xbmc.org/build-deps/sources/giflib-5.1.4.tar.gz) +set(gif_URL http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz) set(gif_HASH SHA256=34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1) set(gif_INSTALL ${CMAKE_BINARY_DIR}/gif/install) set(gif_BUILD ${CMAKE_BINARY_DIR}/gif/src/gif) diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index 092396da11878a..dac0406ba844f8 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -36,8 +36,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc" "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/bucketization_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/metrics/kernels/set_kernels.cc" - "${tensorflow_source_dir}/tensorflow/contrib/metrics/ops/set_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc" diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index 5523023cb7f76e..a9791cdeb75d85 100644 --- a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -9,6 +9,7 @@ set(tf_op_lib_names "io_ops" "linalg_ops" "logging_ops" + "losses" "math_ops" "nn_ops" "no_op" @@ -17,6 +18,7 @@ 
set(tf_op_lib_names "resource_variable_ops" "script_ops" "sdca_ops" + "set_ops" "sendrecv_ops" "sparse_ops" "state_ops" diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index fcbe238652d94f..ce305a4b1e78bb 100644 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -473,12 +473,14 @@ GENERATE_PYTHON_OP_LIB("image_ops") GENERATE_PYTHON_OP_LIB("io_ops") GENERATE_PYTHON_OP_LIB("linalg_ops") GENERATE_PYTHON_OP_LIB("logging_ops") +GENERATE_PYTHON_OP_LIB("losses") GENERATE_PYTHON_OP_LIB("nn_ops") GENERATE_PYTHON_OP_LIB("parsing_ops") GENERATE_PYTHON_OP_LIB("random_ops") GENERATE_PYTHON_OP_LIB("resource_variable_ops") GENERATE_PYTHON_OP_LIB("script_ops") GENERATE_PYTHON_OP_LIB("sdca_ops") +GENERATE_PYTHON_OP_LIB("set_ops") GENERATE_PYTHON_OP_LIB("state_ops") GENERATE_PYTHON_OP_LIB("sparse_ops") GENERATE_PYTHON_OP_LIB("string_ops") diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD new file mode 100644 index 00000000000000..444f5e9e1616c6 --- /dev/null +++ b/tensorflow/contrib/compiler/BUILD @@ -0,0 +1,29 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = [":friends"]) + +package_group( + name = "friends", + packages = ["//tensorflow/..."], +) + +py_library( + name = "compiler_py", + srcs = [ + "__init__.py", + "jit.py", + ], + srcs_version = "PY2AND3", +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/compiler/__init__.py b/tensorflow/contrib/compiler/__init__.py new file mode 100644 index 00000000000000..c4937dadfb8be3 --- /dev/null +++ b/tensorflow/contrib/compiler/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A module for controlling the Tensorflow/XLA JIT compiler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.compiler import jit diff --git a/tensorflow/contrib/compiler/jit.py b/tensorflow/contrib/compiler/jit.py new file mode 100644 index 00000000000000..5c84159fcf17e7 --- /dev/null +++ b/tensorflow/contrib/compiler/jit.py @@ -0,0 +1,50 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Library for controlling the Tensorflow/XLA JIT compiler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + +import tensorflow as tf + + +@contextlib.contextmanager +def experimental_jit_scope(compile_ops=True): + """Enable or disable JIT compilation of operators within the scope. + + NOTE: This is an experimental feature. + + The compilation is a hint and only supported on a best-effort basis. + + Example usage: + with tf.contrib.framework.experimental_jit_scope(): + c = tf.matmul(a, b) # compiled + with tf.contrib.framework.experimental_jit_scope(compile_ops=False): + d = tf.matmul(a, c) # not compiled + + Args: + compile_ops: boolean, whether to enable or disable compilation in the scope. + Yields: + The current scope, enabling or disabling compilation. + + """ + attrs = {"_XlaCompile": tf.AttrValue(b=compile_ops)} + # pylint: disable=protected-access + with tf.get_default_graph()._attr_scope(attrs): + yield + # pylint: enable=protected-access diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index 2b7b177a307328..8a36286eef3e76 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -3,16 +3,17 @@ # APIs are meant to change over time. package( default_visibility = ["//visibility:private"], + features = ["-parse_headers"], ) licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "cuda_py_test") tf_custom_op_library( name = "python/ops/_cudnn_rnn_ops.so", diff --git a/tensorflow/contrib/cudnn_rnn/__init__.py b/tensorflow/contrib/cudnn_rnn/__init__.py index 4314f09959d750..b7ac5e7146f8a9 100644 --- a/tensorflow/contrib/cudnn_rnn/__init__.py +++ b/tensorflow/contrib/cudnn_rnn/__init__.py @@ -22,3 +22,4 @@ from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTM from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNRelu from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanh +from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import RNNParamsSaveable diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc index 8edbcc62ed7f01..6049d2afdab3e6 100644 --- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc @@ -43,6 +43,7 @@ limitations under the License. 
#if GOOGLE_CUDA #include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/stream_executor_util.h" #endif // GOOGLE_CUDA /* @@ -78,6 +79,12 @@ using GPUDevice = Eigen::GpuDevice; template class CudnnRNNParamsSizeOp; +template +class CudnnRNNParamsToCanonical; + +template +class CudnnRNNCanonicalToParams; + template class CudnnRNNForwardOp; @@ -96,6 +103,7 @@ using perftools::gputools::dnn::RnnInputMode; using perftools::gputools::dnn::RnnDirectionMode; using perftools::gputools::dnn::ToDataType; using perftools::gputools::DeviceMemory; +using perftools::gputools::DeviceMemoryBase; using perftools::gputools::ScratchAllocator; using perftools::gputools::port::StatusOr; @@ -184,6 +192,16 @@ DeviceMemory CastDeviceMemory(Tensor* tensor) { tensor->template flat().size() * sizeof(T)); } +DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory, + int64 offset, int64 size) { + const void* base_ptr = device_memory.opaque(); + void* offset_ptr = + const_cast(reinterpret_cast(base_ptr) + offset); + CHECK(offset + size <= device_memory.size()) + << "The slice is not within the region of DeviceMemory."; + return DeviceMemoryBase(offset_ptr, size); +} + inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) { return s.ok() ? Status::OK() : Status(static_cast( static_cast(s.code())), @@ -357,6 +375,28 @@ Status ExtractForwardInput(OpKernelContext* context, using perftools::gputools::dnn::RnnDescriptor; +template +void RestoreParams(const OpInputList params_input, + const std::vector& params, + DeviceMemoryBase* data_dst, + perftools::gputools::Stream* stream) { + int num_params = params.size(); + CHECK(params_input.size() == num_params) + << "Number of params mismatch. Expected " << params_input.size() + << ", got " << num_params; + for (int i = 0; i < params.size(); i++) { + int64 size_in_bytes = params[i].size; + int64 size = size_in_bytes / sizeof(T); + CHECK(size == params_input[i].NumElements()) + << "Params size mismatch. Expected " << size << ", got " + << params_input[i].NumElements(); + auto data_src_ptr = StreamExecutorUtil::AsDeviceMemory(params_input[i]); + DeviceMemoryBase data_dst_ptr = + SliceDeviceMemory(*data_dst, params[i].offset, size_in_bytes); + stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes); + } +} + } // namespace // A common base class for RNN kernels. It extracts common attributes and @@ -458,6 +498,162 @@ REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsSize") .TypeConstraint("S"), CudnnRNNParamsSizeOp); +// Convert weight and bias params from a platform-specific layout to the +// canonical form. 
+template +class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { + public: + typedef GPUDevice Device; + explicit CudnnRNNParamsToCanonical(OpKernelConstruction* context) + : CudnnRNNKernelCommon(context) { + OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(3); + auto input_ptr = StreamExecutorUtil::AsDeviceMemory(input); + auto* stream = context->op_device_context()->stream(); + + std::unique_ptr rnn_desc; + OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo(context, &rnn_desc)); + int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes(); + CHECK(params_size_in_bytes % sizeof(T) == 0) + << "params_size_in_bytes must be multiple of element size"; + + const Tensor* num_units_t = nullptr; + OP_REQUIRES_OK(context, context->input("num_units", &num_units_t)); + CHECK(TensorShapeUtils::IsScalar(num_units_t->shape())) + << "num_units is not a scalar"; + int num_units = num_units_t->scalar()(); + + const Tensor* input_size_t = nullptr; + OP_REQUIRES_OK(context, context->input("input_size", &input_size_t)); + CHECK(TensorShapeUtils::IsScalar(input_size_t->shape())) + << "input_size is not a scalar"; + int input_size = input_size_t->scalar()(); + + const Tensor* num_layers_t = nullptr; + OP_REQUIRES_OK(context, context->input("num_layers", &num_layers_t)); + CHECK(TensorShapeUtils::IsScalar(num_layers_t->shape())) + << "num_layers is not a scalar"; + int num_layers = num_layers_t->scalar()(); + int num_params_per_layer = num_params_ / num_layers; + + CHECK(num_params_ == rnn_desc->ParamsWeightRegions().size()) + << "Number of params mismatch. Expected " << num_params_ << ", got " + << rnn_desc->ParamsWeightRegions().size(); + for (int i = 0; i < rnn_desc->ParamsWeightRegions().size(); i++) { + int64 size_in_bytes = rnn_desc->ParamsWeightRegions()[i].size; + int64 size = size_in_bytes / sizeof(T); + int width = (i < num_params_per_layer / 2) ? input_size : num_units; + int height = num_units; + CHECK(size == width * height) << "Params size mismatch. Expected " + << width * height << ", got " << size; + // If data is aligned, use slice view to avoid expensive memcpy. + bool start_aligned = + rnn_desc->ParamsWeightRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == + 0; + bool size_aligned = size_in_bytes % EIGEN_MAX_ALIGN_BYTES == 0; + if (start_aligned && size_aligned) { + int start = rnn_desc->ParamsWeightRegions()[i].offset / sizeof(T); + int end = start + size_in_bytes / sizeof(T); + context->set_output(i, input.Slice(start, end)); + } else { + Tensor* output = nullptr; + OP_REQUIRES_OK( + context, + context->allocate_output(i, TensorShape({width, height}), &output)); + DeviceMemoryBase data_src_ptr = SliceDeviceMemory( + input_ptr, rnn_desc->ParamsWeightRegions()[i].offset, + size_in_bytes); + auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory(*output); + stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes); + } + } + + CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size()) + << "Number of params mismatch. Expected " << num_params_ << ", got " + << rnn_desc->ParamsBiasRegions().size(); + for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) { + int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size; + int64 size = size_in_bytes / sizeof(T); + CHECK(size == num_units) << "Params size mismatch. Expected " << num_units + << ", got " << size; + // If data is aligned, use slice view to avoid expensive memcpy. 
+ bool start_aligned = + rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0; + bool size_aligned = size_in_bytes % EIGEN_MAX_ALIGN_BYTES == 0; + if (start_aligned && size_aligned) { + int start = rnn_desc->ParamsBiasRegions()[i].offset / sizeof(T); + int end = start + size_in_bytes / sizeof(T); + context->set_output(num_params_ + i, input.Slice(start, end)); + } else { + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(num_params_ + i, + TensorShape({size}), &output)); + DeviceMemoryBase data_src_ptr = SliceDeviceMemory( + input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes); + auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory(*output); + stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes); + } + } + } + + private: + int num_params_; +}; + +REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonical") + .Device(DEVICE_GPU) + .HostMemory("num_layers") + .HostMemory("num_units") + .HostMemory("input_size") + .TypeConstraint("T"), + CudnnRNNParamsToCanonical); + +// Convert weight and bias params from the canonical form to a +// platform-specific layout. +template +class CudnnRNNCanonicalToParams : public CudnnRNNKernelCommon { + public: + typedef GPUDevice Device; + explicit CudnnRNNCanonicalToParams(OpKernelConstruction* context) + : CudnnRNNKernelCommon(context) {} + + void Compute(OpKernelContext* context) override { + std::unique_ptr rnn_desc; + OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo(context, &rnn_desc)); + int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes(); + CHECK(params_size_in_bytes % sizeof(T) == 0) + << "params_size_in_bytes must be multiple of element size"; + Tensor* output = nullptr; + int params_size = params_size_in_bytes / sizeof(T); + OP_REQUIRES_OK(context, + context->allocate_output(0, {params_size}, &output)); + auto output_ptr = StreamExecutorUtil::AsDeviceMemory(*output); + auto* stream = context->op_device_context()->stream(); + + OpInputList weights; + OP_REQUIRES_OK(context, context->input_list("weights", &weights)); + RestoreParams(weights, rnn_desc->ParamsWeightRegions(), &output_ptr, + stream); + + OpInputList biases; + OP_REQUIRES_OK(context, context->input_list("biases", &biases)); + RestoreParams(biases, rnn_desc->ParamsBiasRegions(), &output_ptr, + stream); + } +}; + +REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParams") + .Device(DEVICE_GPU) + .HostMemory("num_layers") + .HostMemory("num_units") + .HostMemory("input_size") + .TypeConstraint("T"), + CudnnRNNCanonicalToParams); + // Run the forward operation of the RNN model. template class CudnnRNNForwardOp : public CudnnRNNKernelCommon { diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc index 9d2b1b2ebb644d..b5c2390de17468 100644 --- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc +++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc @@ -21,6 +21,12 @@ limitations under the License. namespace tensorflow { namespace { +constexpr auto kCudnnRNNCommonInputs = R"doc( +num_layers: Specifies the number of layers in the RNN model. +num_units: Specifies the size of the hidden state. +input_size: Specifies the size of the input state. +)doc"; + constexpr auto kCudnnRNNCommonAttrs = R"doc( rnn_mode: Indicates the type of the RNN model. 
input_mode: Indicate whether there is a linear projection between the input and @@ -47,12 +53,12 @@ constexpr auto kRNNInputModeAttrs = constexpr auto kRNNDirectionAttrs = "direction: {'unidirectional', 'bidirectional'} = 'unidirectional'"; -constexpr auto kCudnnRNNCanonicalParams = R"doc( -canonical_weights: the canonical form of weights that can be used for saving +constexpr auto kCudnnRNNParamsCanonical = R"doc( +weights: the canonical form of weights that can be used for saving and restoration. They are more likely to be compatible across different generations. -canonical_biases: the canonical form of biases that can be used for saving and - restoration. They are more likely to be compatible across different +biases: the canonical form of biases that can be used for saving + and restoration. They are more likely to be compatible across different generations. )doc"; @@ -80,11 +86,8 @@ REGISTER_OP("CudnnRNNParamsSize") Return the params size that can be used by the Cudnn RNN model. Subsequent weight allocation and initialization should use this size. )doc", - kCudnnRNNCommonAttrs, + kCudnnRNNCommonInputs, kCudnnRNNCommonAttrs, R"doc( -num_layers: Specifies the number of layers in the RNN model. -num_units: Specifies the size of the hidden state. -input_size: Specifies the size of the input state. params_size: The size of the params buffer that should be allocated and initialized for this RNN model. Note that this params buffer may not be compatible across GPUs. Please use CudnnRNNParamsWeights and @@ -213,46 +216,72 @@ params_backprop: The backprop to the params buffer in the forward pass. Has the same shape as params. )doc")); -// NOTE(zhengxq): this is not currently implemented yet. And may subject to -// change. REGISTER_OP("CudnnRNNParamsToCanonical") .Input("num_layers: int32") .Input("num_units: int32") .Input("input_size: int32") .Input("params: T") - .Output("canonical_weights: T") - .Output("canonical_biases: T") + .Output("weights: num_params * T") + .Output("biases: num_params * T") .Attr("T: {float}") - .Attr("N: int >= 1") + .Attr("num_params: int") .Attr(kRNNModeAttrs) .Attr(kRNNInputModeAttrs) .Attr(kRNNDirectionAttrs) + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused)); + int num_params; + c->GetAttr("num_params", &num_params); + // Set shape for weight matrices + for (int i = 0; i < num_params; i++) { + c->set_output(i, + c->Matrix(InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim)); + } + // Set shape for bias vectors + for (int i = 0; i < num_params; i++) { + c->set_output(num_params + i, c->Vector(InferenceContext::kUnknownDim)); + } + return Status::OK(); + }) .Doc(strings::StrCat(R"doc( Retrieves a set of weights from the opaque params buffer that can be saved and restored in a way compatible with future runs. )doc", - kCudnnRNNCommonAttrs, kCudnnRNNParamsBuffer, - kCudnnRNNCanonicalParams)); + kCudnnRNNCommonInputs, kCudnnRNNParamsBuffer, R"doc( +num_params: number of parameter sets for all layers. + Each layer may contain multiple parameter sets, with each set consisting of + a weight matrix and a bias vector. +)doc", + kCudnnRNNParamsCanonical, kCudnnRNNCommonAttrs)); -// NOTE(zhengxq): this is not currently implemented yet. And may subject to -// change. 
-REGISTER_OP("CudnnRNNParamsFromCanonical") +REGISTER_OP("CudnnRNNCanonicalToParams") .Input("num_layers: int32") .Input("num_units: int32") .Input("input_size: int32") - .Input("params: Ref(T)") - .Input("canonical_weights: T") - .Input("canonical_biases: T") + .Input("weights: num_params * T") + .Input("biases: num_params * T") + .Output("params: T") .Attr("T: {float}") - .Attr("N: int >= 1") + .Attr("num_params: int") .Attr(kRNNModeAttrs) .Attr(kRNNInputModeAttrs) .Attr(kRNNDirectionAttrs) + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); + return Status::OK(); + }) .Doc(strings::StrCat(R"doc( Writes a set of weights into the opaque params buffer so they can be used in upcoming training or inferences. )doc", - kCudnnRNNCommonAttrs, kCudnnRNNParamsBuffer, - kCudnnRNNCanonicalParams)); + kCudnnRNNCommonInputs, kCudnnRNNParamsCanonical, + kCudnnRNNParamsBuffer, R"doc( +num_params: number of parameter sets for all layers. + Each layer may contain multiple parameter sets, with each set consisting of + a weight matrix and a bias vector. +)doc", + kCudnnRNNCommonAttrs)); } // namespace tensorflow diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index e46d9dbb0f34f1..33632bd574cf70 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -114,9 +114,9 @@ def benchmarkTfRNNLSTMTraining(self): inputs = seq_length * [tf.zeros([batch_size, num_units], tf.float32)] initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=num_units, initializer=initializer, state_is_tuple=True) - multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers) + multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers) outputs, final_state = tf.nn.rnn(multi_cell, inputs, dtype=tf.float32) trainable_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES) @@ -137,7 +137,7 @@ def benchmarkTfRNNLSTMBlockCellTraining(self): inputs = seq_length * [tf.zeros([batch_size, num_units], tf.float32)] cell = tf.contrib.rnn.python.ops.lstm_ops.LSTMBlockCell( num_units=num_units) - multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers) + multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers) outputs, final_state = tf.nn.rnn(multi_cell, inputs, dtype=tf.float32) trainable_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES) diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index 28ac8d15dbeeff..6ed11953370ec9 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -18,8 +18,12 @@ from __future__ import division from __future__ import print_function +import os +import unittest import tensorflow as tf +from tensorflow.python.framework import ops from tensorflow.python.framework.test_util import TensorFlowTestCase +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -40,6 +44,90 @@ def _CreateModel(self, rnn_mode, num_layers, num_units, input_size): raise ValueError("Invalid rnn_mode: %s" % rnn_mode) return model + def _create_params_savable(self, params, model): + """Create a 
RNNParamsSaveable for the weight and bias parameters. + + Args: + params: a Variable for weight and bias parameters. + model: a CudnnRNN model. + """ + params_saveable = tf.contrib.cudnn_rnn.RNNParamsSaveable( + model.params_to_canonical, model.canonical_to_params, params) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable) + + def _testSaveRestoreVariable(self, rnn_mode): + model = self._CreateModel(rnn_mode, num_layers=2, num_units=7, input_size=3) + tf.set_random_seed(1234) + params_size_t = model.params_size() + params = variables.Variable( + tf.random_uniform([params_size_t]), validate_shape=False) + self._create_params_savable(params, model) + save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") + saver = tf.train.Saver(write_version=tf.train.SaverDef.V2) + with self.test_session(use_gpu=True) as sess: + sess.run(tf.global_variables_initializer()) + params_v = sess.run(params) + val = saver.save(sess, save_path) + self.assertEqual(save_path, val) + with self.test_session(use_gpu=True) as sess: + reset_params = tf.assign(params, tf.zeros([params_size_t])) + sess.run(reset_params) + saver.restore(sess, save_path) + params_v_restored = sess.run(params) + self.assertAllEqual(params_v, params_v_restored) + + def _testSaveRestoreOutput(self, rnn_mode): + num_layers = 2 + num_units = 7 + input_size = 7 + seq_length = 10 + batch_size = 5 + dir_count = 1 + model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) + params_size_t = model.params_size() + params = variables.Variable(tf.ones([params_size_t]), validate_shape=False) + self._create_params_savable(params, model) + save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") + saver = tf.train.Saver(write_version=tf.train.SaverDef.V2) + + has_input_c = (rnn_mode == "lstm") + input_data = tf.ones([seq_length, batch_size, input_size]) + input_h = tf.ones([num_layers * dir_count, batch_size, num_units]) + if has_input_c: + input_c = tf.ones([num_layers * dir_count, batch_size, num_units]) + outputs = model( + input_data=input_data, + input_h=input_h, + input_c=input_c, + params=params, + is_training=False) + else: + outputs = model( + input_data=input_data, + input_h=input_h, + params=params, + is_training=False) + total_sum = sum(map(tf.reduce_sum, outputs)) + with self.test_session(use_gpu=True) as sess: + sess.run(tf.global_variables_initializer()) + total_sum_v = sess.run(total_sum) + val = saver.save(sess, save_path) + self.assertEqual(save_path, val) + with self.test_session(use_gpu=True) as sess: + reset_params = tf.assign(params, tf.zeros([params_size_t])) + sess.run(reset_params) + saver.restore(sess, save_path) + total_sum_v_restored = sess.run(total_sum) + self.assertAllEqual(total_sum_v, total_sum_v_restored) + + @unittest.skipUnless(tf.test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def testSaveRestore(self): + rnn_modes = ["lstm", "gru", "rnn_tanh", "rnn_relu"] + for rnn_mode in rnn_modes: + self._testSaveRestoreVariable(rnn_mode) + self._testSaveRestoreOutput(rnn_mode) + def _MinLSTMParamSize(self, num_layers, num_units, @@ -62,9 +150,9 @@ def _testOneLSTMParamsSize(self, num_layers, num_units, input_size): params_size_v = sess.run(params_size) self.assertLessEqual(min_params_size, params_size_v) + @unittest.skipUnless(tf.test.is_built_with_cuda(), + "Test only applicable when running on GPUs") def testLSTMParamsSize(self): - if not tf.test.is_built_with_cuda(): - return test_configs = [ [4, 200, 200], [4, 200, 300], @@ -85,10 
+173,9 @@ def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, params_size_t = model.params_size() input_data = tf.ones([seq_length, batch_size, input_size]) input_h = tf.ones([num_layers * dir_count, batch_size, num_units]) - if has_input_c: - input_c = tf.ones([num_layers * dir_count, batch_size, num_units]) params = tf.Variable(tf.ones([params_size_t]), validate_shape=False) if has_input_c: + input_c = tf.ones([num_layers * dir_count, batch_size, num_units]) output, output_h, output_c = model( input_data=input_data, input_h=input_h, @@ -113,9 +200,9 @@ def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, self.assertAllClose( total_sum_v[0], expected, atol=tolerance, rtol=tolerance) + @unittest.skipUnless(tf.test.is_built_with_cuda(), + "Test only applicable when running on GPUs") def testSimpleInference(self): - if not tf.test.is_built_with_cuda(): - return test_configs = [ ["lstm", 231833.22, @@ -183,12 +270,11 @@ def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, tf.random_uniform([seq_length, batch_size, input_size])) input_h = tf.Variable( tf.random_uniform([num_layers * dir_count, batch_size, num_units])) - if has_input_c: - input_c = tf.Variable( - tf.random_uniform([num_layers * dir_count, batch_size, num_units])) params = tf.Variable( tf.random_uniform([params_size_t]), validate_shape=False) if has_input_c: + input_c = tf.Variable( + tf.random_uniform([num_layers * dir_count, batch_size, num_units])) output, output_h, output_c = model( input_data=input_data, input_h=input_h, @@ -221,9 +307,9 @@ def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, [1]) self.assertLess(err, tolerance) + @unittest.skipUnless(tf.test.is_built_with_cuda(), + "Test only applicable when running on GPUs") def testSimpleTraining(self): - if not tf.test.is_built_with_cuda(): - return test_configs = [ ["lstm", 1e-2, diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 4a3120dcb8ded1..8fc3e2d1151b63 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -16,17 +16,77 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools from tensorflow.contrib.cudnn_rnn.ops import gen_cudnn_rnn_ops from tensorflow.contrib.util import loader +from tensorflow.python.framework import common_shapes from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import state_ops from tensorflow.python.platform import resource_loader +from tensorflow.python.training import saver + _cudnn_rnn_ops_so = loader.load_op_library( resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so")) + +# TODO(yaozhang): make sure we only save the canonical version of params and +# don't save the platform-specific version to avoid potential race +# conditions where params is updated by both versions when being restored. +# Currently, checkpointing will function properly, despite that we save both +# versions, because Saver restores customized savables after Variables. +# However, it is good to not rely on this restoring order of Saver and to +# avoid unnecessary storage. Add a test to check only the canonical version is +# saved. 
+class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject): + """SaveableObject implementation that handles the RNN params variable.""" + + def __init__(self, params_to_canonical, canonical_to_params, + *param_variables): + """Creates a RNNParamsSaveable object. + + Args: + params_to_canonical: a function to convert params from a specific format + for cuDNN or other RNN ops to the canonical format . + canonical_to_params: a function to convert params from the canonical + format to a specific format for cuDNN or other RNN ops. The function + must return a scalar (e.g. in the case of cuDNN) or a tuple. + *param_variables: a list of Variables for parameters in a specific form. + For cuDNN RNN ops, this is a single merged variable for both weights + and biases; for other RNN ops, this might be multiple unmerged or + partially merged variables respectively for weights and biases. + """ + # There is only a single merged parameter variable for cuDNN when saving. + weights, biases = params_to_canonical(param_variables[0]) + self._canonical_to_params = canonical_to_params + self._variables = param_variables + # We currently don't use slice_spec. It might be useful in a distributed + # setting where each parameter server node stores a slice of variable, + # instead of having the master pull all slices and then save them. + slice_spec = "" + specs = [ + saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param.name) + for param in itertools.chain(weights, biases) + ] + super(RNNParamsSaveable, self).__init__(None, specs, "params_canonical") + + def restore(self, restored_tensors, restored_shapes): + weights = restored_tensors[:len(restored_tensors) // 2] + biases = restored_tensors[len(restored_tensors) // 2:] + params = self._canonical_to_params(weights, biases) + if not isinstance(params, tuple): + params = (params,) + assign_ops = [ + state_ops.assign( + variable, param, validate_shape=False) + for variable, param in zip(self._variables, params) + ] + return control_flow_ops.group(*assign_ops) + _cudnn_rnn_common_doc_string = """ Cudnn RNN has an opaque parameter buffer that can be used for inference and training. But it is possible that the layout of the parameter buffers @@ -48,7 +108,10 @@ class _CudnnRNN(object): - """Create an RNN model using the underlying Cudnn implementation. + """Creates an RNN model using the underlying Cudnn implementation. + + Note that self._NUM_PARAMS_PER_LAYER is the number of parameter sets of + weight and bias per layer. It needs to be defined in subclasses. """ __doc__ += _cudnn_rnn_common_doc_string @@ -62,7 +125,7 @@ def __init__(self, dropout=0., seed=0, seed2=0): - """Create a CudnnRNN model from model spec. + """Creates a CudnnRNN model from model spec. Args: rnn_mode: a string specifies the mode, under which this RNN model runs. @@ -94,7 +157,7 @@ def __init__(self, self._seed2 = seed2 def params_size(self): - """Calculate the size of the opaque parameter buffer needed for this model. + """Calculates the size of the opaque parameter buffer needed for this model. Returns: The calculated parameter buffer size. @@ -110,7 +173,7 @@ def params_size(self): direction=self._direction)[0] def __call__(self, input_data, input_h, input_c, params, is_training=True): - """Run the forward step for the RNN model. + """Runs the forward step for the RNN model. Args: input_data: the input sequence to the RNN model. 
@@ -141,13 +204,53 @@ def __call__(self, input_data, input_h, input_c, params, is_training=True): is_training=is_training) return (output, output_h, output_c) - # TODO(zhengxq): add reading and writing canonical weights. + def params_to_canonical(self, params): + """Converts params from a specific format of cuDNN to the canonical format. + + Args: + params: a Variable for weight and bias parameters. + + Returns: + A function for the specific-to-canonical conversion. + """ + weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + params=params, + num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction) + return weights, biases + + def canonical_to_params(self, weights, biases): + """Converts params from the canonical format to a specific format of cuDNN. + + Args: + weights: a Tensor for weight parameters. + biases: a Tensor for bias parameters. + + Returns: + A function for the canonical-to-params-to-specific conversion.. + """ + return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + weights=weights, + biases=biases, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction) class CudnnLSTM(_CudnnRNN): - """Cudnn implementation of the LSTM model. - """ + """Cudnn implementation of the LSTM model.""" __doc__ += _cudnn_rnn_common_doc_string + # 4 sets of weight and bias parameters for the recurrent input, and 4 for the + # previous layer input. + _NUM_PARAMS_PER_LAYER = 8 def __init__(self, num_layers, @@ -158,7 +261,7 @@ def __init__(self, dropout=0., seed=0, seed2=0): - """Create a Cudnn LSTM model from model spec. + """Creates a Cudnn LSTM model from model spec. Args: num_layers: the number of layers for the RNN model. @@ -189,7 +292,7 @@ def __init__(self, seed2=seed2) def __call__(self, input_data, input_h, input_c, params, is_training=True): - """Run the forward step for the Cudnn LSTM model. + """Runs the forward step for the Cudnn LSTM model. Args: input_data: the input sequence to the LSTM model. @@ -212,8 +315,7 @@ def __call__(self, input_data, input_h, input_c, params, is_training=True): class _CudnnRNNNoInputC(_CudnnRNN): - """Simple CudnnRNN models without input_c. - """ + """Simple CudnnRNN models without input_c.""" __doc__ += _cudnn_rnn_common_doc_string def __init__(self, @@ -225,7 +327,7 @@ def __init__(self, dropout=0., seed=0, seed2=0): - """Create a Cudnn RNN model from model without hidden-state C. + """Creates a Cudnn RNN model from model without hidden-state C. Args: num_layers: the number of layers for the RNN model. @@ -256,7 +358,7 @@ def __init__(self, seed2=seed2) def __call__(self, input_data, input_h, params, is_training=True): - """Run the forward step for the Cudnn LSTM model. + """Runs the forward step for the Cudnn LSTM model. Args: input_data: the input sequence to the LSTM model. @@ -274,24 +376,30 @@ def __call__(self, input_data, input_h, params, is_training=True): class CudnnGRU(_CudnnRNNNoInputC): - """Cudnn implementation of the GRU model. - """ + """Cudnn implementation of the GRU model.""" __doc__ += _cudnn_rnn_common_doc_string _rnn_mode = "gru" + # 3 sets of weight and bias parameters for the recurrent input, and 3 for the + # previous layer input. 
+ _NUM_PARAMS_PER_LAYER = 6 class CudnnRNNTanh(_CudnnRNNNoInputC): - """Cudnn implementation of the RNN-tanh model. - """ + """Cudnn implementation of the RNN-tanh model.""" __doc__ += _cudnn_rnn_common_doc_string _rnn_mode = "rnn_tanh" + # 1 set of weight and bias parameters for the recurrent input, and 1 for the + # previous layer input. + _NUM_PARAMS_PER_LAYER = 2 class CudnnRNNRelu(_CudnnRNNNoInputC): - """Cudnn implementation of the RNN-relu model. - """ + """Cudnn implementation of the RNN-relu model.""" __doc__ += _cudnn_rnn_common_doc_string _rnn_mode = "rnn_relu" + # 1 set of weight and bias parameters for the recurrent input, and 1 for the + # previous layer input. + _NUM_PARAMS_PER_LAYER = 2 @ops.RegisterGradient("CudnnRNN") @@ -314,3 +422,10 @@ def _cudnn_rnn_backward(op, *grad): rnn_mode=op.get_attr("rnn_mode"), input_mode=op.get_attr("input_mode"), direction=op.get_attr("direction")) + + +ops.RegisterShape("CudnnRNNParamsSize")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("CudnnRNNParamsToCanonical")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("CudnnRNNCanonicalToParams")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("CudnnRNN")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("CudnnRNNBackprop")(common_shapes.call_cpp_shape_fn) diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD index ba21a89e79501e..791580a04abed2 100644 --- a/tensorflow/contrib/deprecated/BUILD +++ b/tensorflow/contrib/deprecated/BUILD @@ -11,7 +11,6 @@ py_library( name = "deprecated_py", srcs = [ "__init__.py", - "summaries.py", ], srcs_version = "PY2AND3", deps = ["//tensorflow/python:logging_ops"], diff --git a/tensorflow/contrib/deprecated/__init__.py b/tensorflow/contrib/deprecated/__init__.py index 314e1e28d6bf11..2c94882cd75322 100644 --- a/tensorflow/contrib/deprecated/__init__.py +++ b/tensorflow/contrib/deprecated/__init__.py @@ -12,17 +12,86 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Deprecated endpoints that we aren't yet ready to remove entirely. +"""Non-core alias for the deprecated tf.X_summary ops. + +For TensorFlow 1.0, we have re-organized the TensorFlow summary ops into a +submodule, and made some semantic tweaks. The first thing to note is that we +moved the APIs around as follows: + +tf.scalar_summary -> tf.summary.scalar +tf.histogram_summary -> tf.summary.histogram +tf.audio_summary -> tf.summary.audio +tf.image_summary -> tf.summary.image +tf.merge_summary -> tf.summary.merge +tf.merge_all_summaries -> tf.summary.merge_all + +We think this is a cleaner API and will improve long-term discoverability and +clarity of the TensorFlow API. However, we also took the opportunity to make an +important change to how summary "tags" work. The "tag" of a summary is the +string that is associated with the output data, i.e. the key for organizing the +generated protobufs. + +Previously, the tag was allowed to be any unique string, and had no relation +to the summary op generating it, and no relation to the TensorFlow name system. +This made it very difficult to write re-usable code that would add summary +ops to the graph. If you had a function that would add summary ops, you would +need to manually pass in a name scope to that function to create de-duplicated +tags, otherwise your program would fail with a runtime error due to tag +collision. 
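As a concrete illustration of that pain point, here is a hedged sketch (the helper, scopes, and tensors are invented) of a re-usable summary helper under the old API versus the scope-inheriting behavior of the new tf.summary ops described in the next paragraph:

```python
import tensorflow as tf

def add_loss_summary(loss):
  # Old API (deprecated): the tag below is a free-form string with no tie to
  # any name scope, so calling this helper for two different losses collided
  # at runtime unless the caller threaded a unique prefix through by hand:
  #   tf.scalar_summary("loss", loss)
  # New API: the first argument is a node name, so it inherits the enclosing
  # name scope and is deduplicated on conflict.
  tf.summary.scalar("loss", loss)

with tf.name_scope("tower_0"):
  add_loss_summary(tf.constant(1.0))  # recorded under tower_0/loss
with tf.name_scope("tower_1"):
  add_loss_summary(tf.constant(2.0))  # recorded under tower_1/loss
```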
+ +The new summary APIs under tf.summary throw away the "tag" as an independent +concept; instead, the first argument is the node name. This means that summary +tags now automatically inherit the surrounding TF name scope, and automatically +are deduplicated if there is a conflict. However, now the only allowed +characters are alphanumerics, underscores, and forward slashes. To make +migration easier, the new APIs automatically convert illegal characters to +underscores. + +Just as an example, consider the following "before" and "after" code snippets: + +# Before +def add_activation_summaries(v, scope): + tf.scalar_summary("%s/fraction_of_zero" % scope, tf.nn.zero_fraction(v)) + tf.histogram_summary("%s/activations" % scope, v) + +# After +def add_activation_summaries(v): + tf.summary.scalar("fraction_of_zero", tf.nn.zero_fraction(v)) + tf.summary.histogram("activations", v) + +Now, so long as the add_activation_summaries function is called from within the +right name scope, the behavior is the same. + +Because this change does modify the behavior and could break tests, we can't +automatically migrate usage to the new APIs. That is why we are making the old +APIs temporarily available here at tf.contrib.deprecated. + +In addition to the name change described above, there are two further changes +to the new summary ops: + +- the "max_images" argument for tf.image_summary was renamed to "max_outputs" + for tf.summary.image +- tf.scalar_summary accepted arbitrary tensors of tags and values. However, + tf.summary.scalar requires a single scalar name and scalar value. In most + cases, you can create tf.summary.scalar ops in a loop to get the same behavior. + +As before, TensorBoard will group charts by the top-level name scope. This may +be inconvenient, since in the new summary ops the summary will inherit that +name scope without user control. We plan to add more grouping mechanisms to +TensorBoard, so it will be possible to specify the TensorBoard group for +each summary via the summary API. 
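For the tensor-of-tags case in the second bullet above, a hedged sketch of the loop-based replacement (the loss tensors and naming are invented for illustration):

```python
import tensorflow as tf

losses = {"train": tf.constant(0.25), "eval": tf.constant(0.40)}

# The deprecated form took a tensor of tags and a tensor of values in one
# call, e.g. tf.scalar_summary(["loss/train", "loss/eval"], <values tensor>).
# The replacement is one tf.summary.scalar per value:
for name, value in losses.items():
  tf.summary.scalar("loss/" + name, value)

merged = tf.summary.merge_all()
```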
+ """ from __future__ import absolute_import from __future__ import division from __future__ import print_function + # pylint: disable=unused-import,line-too-long -from tensorflow.contrib.deprecated.summaries import audio_summary -from tensorflow.contrib.deprecated.summaries import histogram_summary -from tensorflow.contrib.deprecated.summaries import image_summary -from tensorflow.contrib.deprecated.summaries import merge_all_summaries -from tensorflow.contrib.deprecated.summaries import merge_summary -from tensorflow.contrib.deprecated.summaries import scalar_summary +from tensorflow.python.ops.logging_ops import audio_summary +from tensorflow.python.ops.logging_ops import histogram_summary +from tensorflow.python.ops.logging_ops import image_summary +from tensorflow.python.ops.logging_ops import merge_all_summaries +from tensorflow.python.ops.logging_ops import merge_summary +from tensorflow.python.ops.logging_ops import scalar_summary # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py index a3bc82d8a7f6c7..ba69c505c85f25 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py @@ -127,6 +127,17 @@ def _testPmf(self, **kwargs): self.assertAllClose(dist.pmf(x).eval(), expected_pmf) self.assertAllClose(dist.log_pmf(x).eval(), np.log(expected_pmf)) + def testPmfCorrectBroadcastDynamicShape(self): + with self.test_session(): + p = tf.placeholder(dtype=tf.float32) + dist = tf.contrib.distributions.Bernoulli(p=p) + event1 = [1, 0, 1] + event2 = [[1, 0, 1]] + self.assertAllClose(dist.pmf(event1).eval({p: [0.2, 0.3, 0.4]}), + [0.2, 0.7, 0.4]) + self.assertAllClose(dist.pmf(event2).eval({p: [0.2, 0.3, 0.4]}), + [[0.2, 0.7, 0.4]]) + def testPmfWithP(self): p = [[0.2, 0.4], [0.3, 0.6]] self._testPmf(p=p) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py index 8f4e00bfa15680..d18ce9ab61be99 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py @@ -261,6 +261,89 @@ def testLogCombinationsShape(self): self.assertEqual([2, 2], log_binom.get_shape()) +class DynamicShapeTest(tf.test.TestCase): + + def testSameDynamicShape(self): + with self.test_session(): + scalar = tf.constant(2.0) + scalar1 = tf.placeholder(dtype=tf.float32) + + vector = [0.3, 0.4, 0.5] + vector1 = tf.placeholder(dtype=tf.float32, shape=[None]) + vector2 = tf.placeholder(dtype=tf.float32, shape=[None]) + + multidimensional = [[0.3, 0.4], [0.2, 0.6]] + multidimensional1 = tf.placeholder(dtype=tf.float32, shape=[None, None]) + multidimensional2 = tf.placeholder(dtype=tf.float32, shape=[None, None]) + + # Scalar + self.assertTrue(distribution_util.same_dynamic_shape( + scalar, scalar1).eval({ + scalar1: 2.0})) + + # Vector + + self.assertTrue(distribution_util.same_dynamic_shape( + vector, vector1).eval({ + vector1: [2.0, 3.0, 4.0]})) + self.assertTrue(distribution_util.same_dynamic_shape( + vector1, vector2).eval({ + vector1: [2.0, 3.0, 4.0], + vector2: [2.0, 3.5, 6.0]})) + + # Multidimensional + self.assertTrue(distribution_util.same_dynamic_shape( + multidimensional, multidimensional1).eval({ + multidimensional1: [[2.0, 3.0], [3.0, 4.0]]})) + 
self.assertTrue(distribution_util.same_dynamic_shape( + multidimensional1, multidimensional2).eval({ + multidimensional1: [[2.0, 3.0], [3.0, 4.0]], + multidimensional2: [[1.0, 3.5], [6.3, 2.3]]})) + + + # Scalar, X + self.assertFalse(distribution_util.same_dynamic_shape( + scalar, vector1).eval({ + vector1: [2.0, 3.0, 4.0]})) + self.assertFalse(distribution_util.same_dynamic_shape( + scalar1, vector1).eval({ + scalar1: 2.0, + vector1: [2.0, 3.0, 4.0]})) + self.assertFalse(distribution_util.same_dynamic_shape( + scalar, multidimensional1).eval({ + multidimensional1: [[2.0, 3.0], [3.0, 4.0]]})) + self.assertFalse(distribution_util.same_dynamic_shape( + scalar1, multidimensional1).eval({ + scalar1: 2.0, + multidimensional1: [[2.0, 3.0], [3.0, 4.0]]})) + + # Vector, X + self.assertFalse(distribution_util.same_dynamic_shape( + vector, vector1).eval({ + vector1: [2.0, 3.0]})) + self.assertFalse(distribution_util.same_dynamic_shape( + vector1, vector2).eval({ + vector1: [2.0, 3.0, 4.0], + vector2: [6.0]})) + self.assertFalse(distribution_util.same_dynamic_shape( + vector, multidimensional1).eval({ + multidimensional1: [[2.0, 3.0], [3.0, 4.0]]})) + self.assertFalse(distribution_util.same_dynamic_shape( + vector1, multidimensional1).eval({ + vector1: [2.0, 3.0, 4.0], + multidimensional1: [[2.0, 3.0], [3.0, 4.0]]})) + + # Multidimensional, X + self.assertFalse(distribution_util.same_dynamic_shape( + multidimensional, multidimensional1).eval({ + multidimensional1: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]})) + self.assertFalse(distribution_util.same_dynamic_shape( + multidimensional1, multidimensional2).eval({ + multidimensional1: [[2.0, 3.0], [3.0, 4.0]], + multidimensional2: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]})) + + + class RotateTransposeTest(tf.test.TestCase): def _np_rotate_transpose(self, x, shift): diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/contrib/distributions/python/ops/bernoulli.py index 44962a5f1b93ee..2a338526751daa 100644 --- a/tensorflow/contrib/distributions/python/ops/bernoulli.py +++ b/tensorflow/contrib/distributions/python/ops/bernoulli.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops @@ -131,13 +132,21 @@ def _log_prob(self, event): logits = self.logits # sigmoid_cross_entropy_with_logits doesn't broadcast shape, # so we do this here. - # TODO(b/30637701): Check dynamic shape, and don't broadcast if the - # dynamic shapes are the same. - if (not event.get_shape().is_fully_defined() or - not logits.get_shape().is_fully_defined() or - event.get_shape() != logits.get_shape()): - logits = array_ops.ones_like(event) * logits - event = array_ops.ones_like(logits) * event + + broadcast = lambda logits, event: ( + array_ops.ones_like(event) * logits, + array_ops.ones_like(logits) * event) + + # First check static shape. 
+ if (event.get_shape().is_fully_defined() and + logits.get_shape().is_fully_defined()): + if event.get_shape() != logits.get_shape(): + logits, event = broadcast(logits, event) + else: + logits, event = control_flow_ops.cond( + distribution_util.same_dynamic_shape(logits, event), + lambda: (logits, event), + lambda: broadcast(logits, event)) return -nn.sigmoid_cross_entropy_with_logits(logits, event) def _prob(self, event): diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py index b29c272405f854..e83d8469f2c20f 100644 --- a/tensorflow/contrib/distributions/python/ops/bijector.py +++ b/tensorflow/contrib/distributions/python/ops/bijector.py @@ -146,7 +146,7 @@ def merge(self, x=None, y=None, ildj=None, if mapping is None: mapping = _Mapping(x=x, y=y, ildj=ildj, condition_kwargs=condition_kwargs) - elif not all([arg is None for arg in [x, y, ildj, condition_kwargs]]): + elif not all(arg is None for arg in [x, y, ildj, condition_kwargs]): raise ValueError("Cannot specify mapping and individual args.") return _Mapping( x=self._merge(self.x, mapping.x), @@ -629,12 +629,12 @@ def inverse(self, y, name="inverse", **condition_kwargs): try: x, ildj = self._inverse_and_inverse_log_det_jacobian( y, **condition_kwargs) - if self._constant_ildj is not None: - ildj = self._constant_ildj # Use the "global" result. - elif self.is_constant_jacobian: - self._constant_ildj = ildj except NotImplementedError: raise original_error + if self._constant_ildj is not None: + ildj = self._constant_ildj # Use the "global" result. + elif self.is_constant_jacobian: + self._constant_ildj = ildj x = x if mapping.x is None else mapping.x mapping = mapping.merge(x=x, ildj=ildj) self._cache(mapping) @@ -683,10 +683,10 @@ def inverse_log_det_jacobian( try: x, ildj = self._inverse_and_inverse_log_det_jacobian( y, **condition_kwargs) - if mapping.x is not None: - x = mapping.x except NotImplementedError: raise original_error + if mapping.x is not None: + x = mapping.x if self.is_constant_jacobian: self._constant_ildj = ildj x = x if mapping.x is None else mapping.x @@ -736,6 +736,7 @@ def inverse_and_inverse_log_det_jacobian( # to see if we can separately use _inverse and # _inverse_log_det_jacobian members. try: + # We want this same try/except to catch either NotImplementedError. x = self._inverse(y, **condition_kwargs) if self._constant_ildj is None: ildj = self._inverse_log_det_jacobian(y, **condition_kwargs) @@ -790,6 +791,7 @@ def forward_log_det_jacobian( ildj = -self._forward_log_det_jacobian(x, **condition_kwargs) except NotImplementedError as original_error: try: + # We want this same try/except to catch either NotImplementedError. y = self.inverse(x, **condition_kwargs) if y is None else y ildj = self.inverse_log_det_jacobian(y, **condition_kwargs) except NotImplementedError: @@ -824,8 +826,9 @@ def _cache(self, mapping): # which is not None. 
mapping = mapping.merge(mapping=self._lookup( mapping.x, mapping.y, mapping.condition_kwargs)) - if mapping.x is None or mapping.y is None: - ValueError("Caching expects both (x,y) to be known, i.e., not None.") + if mapping.x is None and mapping.y is None: + raise ValueError("Caching expects at least one of (x,y) to be known, " + "i.e., not None.") self._from_x[mapping.x_key] = mapping self._from_y[mapping.y_key] = mapping @@ -1092,12 +1095,13 @@ def __init__(self, bijectors=(), validate_args=False, name=None): else: dtype = None + parameters = {} + for b in bijectors: + parameters.update(("{}={}".format(b.name, k), v) + for k, v in b.parameters.items()) super(Chain, self).__init__( - parameters=dict(("=".join([b.name, k]), v) - for b in bijectors - for k, v in b.parameters.items()), - is_constant_jacobian=all([b.is_constant_jacobian - for b in bijectors]), + parameters=parameters, + is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors), validate_args=validate_args, dtype=dtype, name=name or ("identity" if not bijectors else @@ -1753,12 +1757,12 @@ def _inverse(self, y): on_value=shape[-1]-np.array(1, dtype=shape.dtype), dtype=shape.dtype) size = array_ops.concat(0, (shape[:-1], np.asarray([1], dtype=shape.dtype))) - log_normalization = -array_ops.slice(x, begin, size) + log_normalization = -array_ops.strided_slice(x, begin, begin + size) # Here we slice out all but the last coordinate; see above for idea. begin = array_ops.zeros_like(shape) size = array_ops.concat(0, (shape[:-1], [shape[-1]-1])) - x = array_ops.slice(x, begin, size) + x = array_ops.strided_slice(x, begin, begin + size) x += log_normalization diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index 1da931c08e1222..8f4b893faa9a20 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn @@ -104,6 +105,36 @@ def assert_symmetric(matrix): [check_ops.assert_equal(matrix, matrix_t)], matrix) +def same_dynamic_shape(a, b): + """Returns whether a and b have the same dynamic shape. + + Args: + a: `Tensor` + b: `Tensor` + + Returns: + `Boolean` `Tensor` representing if both tensors have the same shape. + """ + a = ops.convert_to_tensor(a, name="a") + b = ops.convert_to_tensor(b, name="b") + + # One of the shapes isn't fully defined, so we need to use the dynamic + # shape. + return control_flow_ops.cond( + math_ops.equal(array_ops.rank(a), array_ops.rank(b)), + # Here we can't just do math_ops.equal(a.shape, b.shape), since + # static shape inference may break the equality comparison between + # shape(a) and shape(b) in math_ops.equal. 
+ lambda: math_ops.reduce_all(math_ops.equal( + array_ops.concat(0, ( + array_ops.shape(a), + array_ops.shape(b))), + array_ops.concat(0, ( + array_ops.shape(b), + array_ops.shape(a))))), + lambda: constant_op.constant(False)) + + def get_logits_and_prob( logits=None, p=None, multidimensional=False, validate_args=False, name="GetLogitsAndProb"): diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd.py b/tensorflow/contrib/distributions/python/ops/operator_pd.py index 283adf8b7974c8..14238bfa4bbb94 100644 --- a/tensorflow/contrib/distributions/python/ops/operator_pd.py +++ b/tensorflow/contrib/distributions/python/ops/operator_pd.py @@ -383,7 +383,7 @@ def batch_shape(self, name="batch_shape"): # Derived classes get this "for free" once .shape() is implemented. with ops.name_scope(self.name): with ops.name_scope(name, values=self.inputs): - return array_ops.slice(self.shape(), [0], [self.rank() - 2]) + return array_ops.strided_slice(self.shape(), [0], [self.rank() - 2]) def vector_shape(self, name="vector_shape"): """Shape of (batch) vectors that this (batch) matrix will multiply. @@ -746,7 +746,7 @@ def _flip_vector_to_matrix_dynamic(vec, batch_shape): m = vec_batch_rank - batch_rank # vec_shape_left = [M1,...,Mm] or []. - vec_shape_left = array_ops.slice(vec_shape, [0], [m]) + vec_shape_left = array_ops.strided_slice(vec_shape, [0], [m]) # If vec_shape_left = [], then condensed_shape = [1] since reduce_prod([]) = 1 # If vec_shape_left = [M1,...,Mm], condensed_shape = [M1*...*Mm] condensed_shape = [math_ops.reduce_prod(vec_shape_left)] @@ -819,5 +819,5 @@ def extract_batch_shape(x, num_event_dims, name="extract_batch_shape"): """ with ops.name_scope(name, values=[x]): x = ops.convert_to_tensor(x, name="x") - return array_ops.slice( + return array_ops.strided_slice( array_ops.shape(x), [0], [array_ops.rank(x) - num_event_dims]) diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py index 819a6da47cb162..912572351fb4e6 100644 --- a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py +++ b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py @@ -145,7 +145,7 @@ def _get_identity_operator(self, v): else: v_shape = array_ops.shape(v) v_rank = array_ops.rank(v) - v_batch_shape = array_ops.slice(v_shape, [0], [v_rank - 2]) + v_batch_shape = array_ops.strided_slice(v_shape, [0], [v_rank - 2]) r = array_ops.gather(v_shape, v_rank - 1) # Last dim of v id_shape = array_ops.concat(0, (v_batch_shape, [r, r])) return operator_pd_identity.OperatorPDIdentity( @@ -228,11 +228,13 @@ def _check_shapes_dynamic(self, operator, v, diag): checks.append(check_ops.assert_rank(diag, r_op - 1)) # Check batch shape - checks.append(check_ops.assert_equal( - operator.batch_shape(), array_ops.slice(s_v, [0], [r_v - 2]))) + checks.append( + check_ops.assert_equal(operator.batch_shape(), + array_ops.strided_slice(s_v, [0], [r_v - 2]))) if diag is not None: - checks.append(check_ops.assert_equal( - operator.batch_shape(), array_ops.slice(s_d, [0], [r_d - 1]))) + checks.append( + check_ops.assert_equal(operator.batch_shape( + ), array_ops.strided_slice(s_d, [0], [r_d - 1]))) # Check event shape checks.append(check_ops.assert_equal( diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py index fa1596d555da7c..795bccb800e15a 100644 --- a/tensorflow/contrib/distributions/python/ops/shape.py +++ 
b/tensorflow/contrib/distributions/python/ops/shape.py @@ -309,7 +309,7 @@ def make_dims(start_sum, size, name): start_sum = start_sum if start_sum else ( array_ops.zeros((), dtype=dtypes.int32, name="zero"),) if self._is_all_constant_helper(size, *start_sum): - start = sum([tensor_util.constant_value(s) for s in start_sum]) + start = sum(tensor_util.constant_value(s) for s in start_sum) stop = start + tensor_util.constant_value(size) return ops.convert_to_tensor( list(range(start, stop)), dtype=dtypes.int32, name=name) @@ -342,7 +342,7 @@ def slice_shape(start_sum, size, name): array_ops.zeros((), dtype=dtypes.int32, name="zero"),) if (x.get_shape().ndims is not None and self._is_all_constant_helper(size, *start_sum)): - start = sum([tensor_util.constant_value(s) for s in start_sum]) + start = sum(tensor_util.constant_value(s) for s in start_sum) stop = start + tensor_util.constant_value(size) slice_ = x.get_shape()[start:stop].as_list() if all(s is not None for s in slice_): diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py index b478a12d368cad..e85a6fc1df4426 100644 --- a/tensorflow/contrib/distributions/python/ops/wishart.py +++ b/tensorflow/contrib/distributions/python/ops/wishart.py @@ -180,7 +180,8 @@ def dimension(self): def _event_shape(self): s = self.scale_operator_pd.shape() - return array_ops.slice(s, array_ops.shape(s) - 2, [2]) + return array_ops.strided_slice(s, array_ops.shape(s) - 2, + array_ops.shape(s)) def _get_event_shape(self): return self.scale_operator_pd.get_shape()[-2:] @@ -261,7 +262,7 @@ def _log_prob(self, x): ndims = array_ops.rank(x_sqrt) # sample_ndims = ndims - batch_ndims - event_ndims sample_ndims = ndims - array_ops.shape(batch_shape)[0] - 2 - sample_shape = array_ops.slice( + sample_shape = array_ops.strided_slice( array_ops.shape(x_sqrt), [0], [sample_ndims]) # We need to be able to pre-multiply each matrix by its corresponding diff --git a/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py b/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py index 33877e0a95d25f..4bfbf275dbcc0c 100644 --- a/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py +++ b/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py @@ -54,7 +54,7 @@ def testWalsSolverLhs(self): ] = tf.contrib.factorization.wals_compute_partial_lhs_and_rhs( self._column_factors, self._column_weights, self._unobserved_weights, self._row_weights, sparse_block.indices, sparse_block.values, - sparse_block.shape[0], False) + sparse_block.dense_shape[0], False) self.assertAllClose(lhs_tensor.eval(), [ [ [0.014800, 0.017000, 0.019200], diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py index 2914f255992289..cceca3e7893929 100644 --- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py +++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py @@ -268,7 +268,7 @@ def initialize_op(self): if self._row_weights is not None: assert self._col_weights is not None all_vars.extend(self._row_weights + self._col_weights) - return tf.initialize_variables(all_vars) + return tf.variables_initializer(all_vars) @classmethod def _shard_sizes(cls, dims, num_shards): @@ -769,7 +769,8 @@ def _process_input_helper(self, update_row_factors, new_sp_shape = (tf.concat(0, [row_shape, col_shape]) if transpose_input else tf.concat(0, 
[col_shape, row_shape])) new_sp_input = tf.SparseTensor(indices=new_sp_indices, - values=sp_input.values, shape=new_sp_shape) + values=sp_input.values, + dense_shape=new_sp_shape) # Compute lhs and rhs of the normal equations total_lhs = (self._unobserved_weight * gramian) diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py index 7b28b1337de7fd..151c1fdd57c655 100644 --- a/tensorflow/contrib/framework/python/framework/experimental_test.py +++ b/tensorflow/contrib/framework/python/framework/experimental_test.py @@ -50,13 +50,12 @@ def _fn(arg0, arg1): "be removed at any time, and without warning." "\n" "\n" - "\n Args:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." "\n" - "\n Returns:" - "\n Sum of args." - "\n ", _fn.__doc__) + "\nReturns:" + "\n Sum of args.", _fn.__doc__) # Assert calling new fn issues log warning. self.assertEqual(3, _fn(1, 2)) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util.py b/tensorflow/contrib/framework/python/framework/tensor_util.py index c149d1484994fb..a326b78a5f2ae4 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util.py @@ -39,6 +39,10 @@ 'with_same_shape'] +convert_to_tensor_or_sparse_tensor = ( + sparse_tensor.convert_to_tensor_or_sparse_tensor) + + def _assert_same_base_type(items, expected_type=None): r"""Asserts all items are of the same base type. @@ -361,33 +365,3 @@ def with_shape(expected_shape, tensor): tensor.name, expected_shape, actual_shape)) return tensor - - -def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None): - """Converts value to a `SparseTensor` or `Tensor`. - - Args: - value: A `SparseTensor`, `SparseTensorValue`, or an object whose type has a - registered `Tensor` conversion function. - dtype: Optional element type for the returned tensor. If missing, the - type is inferred from the type of `value`. - name: Optional name to use if a new `Tensor` is created. - - Returns: - A `SparseTensor` or `Tensor` based on `value`. - - Raises: - RuntimeError: If result type is incompatible with `dtype`. 
- """ - if dtype is not None: - dtype = dtypes.as_dtype(dtype) - if isinstance(value, sparse_tensor.SparseTensorValue): - value = sparse_tensor.SparseTensor.from_value(value) - if isinstance(value, sparse_tensor.SparseTensor): - if dtype and not dtype.is_compatible_with(value.dtype): - raise RuntimeError( - 'Sparse dtype: requested = %s, actual = %s' % ( - dtype.name, value.dtype.name)) - return value - return ops.internal_convert_to_tensor( - value, dtype=dtype, name=name) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py index d5686251a68c94..dba302bab669eb 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py @@ -279,32 +279,6 @@ def test_with_shape_partial(self): ValueError, tensor_2x2.eval, {tensor_partial_shape: [42.0]}) -class ConvertToTensorOrSparseTensorTest(tf.test.TestCase): - - def test_convert_dense(self): - with self.test_session(): - value = [42, 43] - from_value = tf.contrib.framework.convert_to_tensor_or_sparse_tensor( - value) - self.assertAllEqual(value, from_value.eval()) - - def test_convert_sparse(self): - with self.test_session(): - indices = [[0, 1], [1, 0]] - values = [42, 43] - shape = [2, 2] - sparse_tensor_value = tf.SparseTensorValue(indices, values, shape) - sparse_tensor = tf.SparseTensor.from_value(sparse_tensor_value) - from_value = tf.contrib.framework.convert_to_tensor_or_sparse_tensor( - sparse_tensor_value).eval() - from_tensor = tf.contrib.framework.convert_to_tensor_or_sparse_tensor( - sparse_tensor).eval() - for convertee in [from_value, from_tensor]: - self.assertAllEqual(sparse_tensor_value.indices, convertee.indices) - self.assertAllEqual(sparse_tensor_value.values, convertee.values) - self.assertAllEqual(sparse_tensor_value.shape, convertee.shape) - - class RemoveSqueezableDimensionsTest(tf.test.TestCase): def testRemoveSqueezableDimensions(self): diff --git a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py index bcea8692a6a93d..dfaaafd88eae68 100644 --- a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py +++ b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py @@ -31,8 +31,10 @@ def testPrintTensorPassthrough(self): self.assertEqual(a.eval(), tf.constant([1]).eval()) def testPrintSparseTensorPassthrough(self): - a = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) - b = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) + a = tf.SparseTensor( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + b = tf.SparseTensor( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) a = tf.contrib.framework.print_op(a) with self.test_session(): self.assertAllEqual(tf.sparse_tensor_to_dense(a).eval(), @@ -50,7 +52,7 @@ def testPrintVariable(self): a = tf.Variable(1.0) a = tf.contrib.framework.print_op(a) with self.test_session(): - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() a.eval() if __name__ == "__main__": diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py index 2db91cd889e0d2..c6425e8d243e50 100644 --- a/tensorflow/contrib/framework/python/ops/variables.py +++ b/tensorflow/contrib/framework/python/ops/variables.py @@ -19,6 +19,8 @@ from __future__ import division from __future__ import 
print_function +import re + from tensorflow.contrib.framework.python.ops import add_arg_scope as contrib_add_arg_scope from tensorflow.contrib.framework.python.ops import gen_variable_ops from tensorflow.contrib.util import loader @@ -46,6 +48,7 @@ 'assign_from_values', 'assign_from_values_fn', 'create_global_step', + 'filter_variables', 'get_global_step', 'get_or_create_global_step', 'get_local_variables', @@ -624,3 +627,63 @@ def __call__(self, op): device_spec.job = self._job_name device_spec.task = task_id return device_spec.to_string() + + +def filter_variables(var_list, include_patterns=None, exclude_patterns=None, + reg_search=True): + """Filter a list of variables using regular expressions. + + First includes variables according to the list of include_patterns. + Afterwards, eliminates variables according to the list of exclude_patterns. + + For example, one can obtain a list of variables with the weights of all + convolutional layers (depending on the network definition) by: + + ```python + variables = tf.contrib.framework.get_model_variables() + conv_weight_variables = tf.contrib.framework.filter_variables( + variables, + include_patterns=['Conv'], + exclude_patterns=['biases', 'Logits']) + ``` + + Args: + var_list: list of variables. + include_patterns: list of regular expressions to include. Defaults to None, + which means all variables are selected according to the include rules. + A variable is included if it matches any of the include_patterns. + exclude_patterns: list of regular expressions to exclude. Defaults to None, + which means all variables are selected according to the exclude rules. + A variable is excluded if it matches any of the exclude_patterns. + reg_search: boolean. If True (default), performs re.search to find matches + (i.e. pattern can match any substring of the variable name). If False, + performs re.match (i.e. regexp should match from the beginning of the + variable name). + + Returns: + filtered list of variables. + """ + if reg_search: + reg_exp_func = re.search + else: + reg_exp_func = re.match + + # First include variables. + if include_patterns is None: + included_variables = list(var_list) + else: + included_variables = [] + for var in var_list: + if any(reg_exp_func(ptrn, var.name) for ptrn in include_patterns): + included_variables.append(var) + + # Afterwards, exclude variables. 
+ if exclude_patterns is None: + filtered_variables = included_variables + else: + filtered_variables = [] + for var in included_variables: + if not any(reg_exp_func(ptrn, var.name) for ptrn in exclude_patterns): + filtered_variables.append(var) + + return filtered_variables diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py index 191291cbbf027e..d0264678da3c6c 100644 --- a/tensorflow/contrib/framework/python/ops/variables_test.py +++ b/tensorflow/contrib/framework/python/ops/variables_test.py @@ -1094,17 +1094,18 @@ def testMissingVariablesDict(self): self.assertEqual(init_value0, var0.eval()) self.assertEqual(init_value1, var1.eval()) + class ZeroInitializerOpTest(tf.test.TestCase): def _testZeroInitializer(self, shape, initializer, use_init): var = tf.Variable(initializer) var_zero = tf.contrib.framework.zero_initializer(var) with self.test_session() as sess: - with self.assertRaisesOpError("Attempting to use uninitialized value"): + with self.assertRaisesOpError('Attempting to use uninitialized value'): var.eval() if use_init: sess.run(var.initializer) - with self.assertRaisesOpError("input is already initialized"): + with self.assertRaisesOpError('input is already initialized'): var_zero.eval() self.assertAllClose(np.ones(shape), var.eval()) else: @@ -1115,7 +1116,103 @@ def testZeroInitializer(self): for dtype in (tf.int32, tf.int64, tf.float32, tf.float64): for use_init in (False, True): self._testZeroInitializer( - [10, 20], tf.ones([10, 20], dtype = dtype), use_init) + [10, 20], tf.ones([10, 20], dtype=dtype), use_init) + + +class FilterVariablesTest(tf.test.TestCase): + + def setUp(self): + g = tf.Graph() + with g.as_default(): + var_list = [] + var_list.append(tf.Variable(0, name='conv1/weights')) + var_list.append(tf.Variable(0, name='conv1/biases')) + var_list.append(tf.Variable(0, name='conv2/weights')) + var_list.append(tf.Variable(0, name='conv2/biases')) + var_list.append(tf.Variable(0, name='clfs/weights')) + var_list.append(tf.Variable(0, name='clfs/biases')) + self._var_list = var_list + + def _test_filter_variables(self, expected_var_names, include_patterns=None, + exclude_patterns=None, reg_search=True): + filtered_var_list = tf.contrib.framework.filter_variables( + self._var_list, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + reg_search=reg_search) + + filtered_var_names = [var.op.name for var in filtered_var_list] + + for name in filtered_var_names: + self.assertIn(name, expected_var_names) + for name in expected_var_names: + self.assertIn(name, filtered_var_names) + self.assertEqual(len(filtered_var_names), len(expected_var_names)) + + def testNoFiltering(self): + self._test_filter_variables( + expected_var_names=[ + 'conv1/weights', + 'conv1/biases', + 'conv2/weights', + 'conv2/biases', + 'clfs/weights', + 'clfs/biases']) + + def testIncludeBiases(self): + self._test_filter_variables( + expected_var_names=[ + 'conv1/biases', + 'conv2/biases', + 'clfs/biases'], + include_patterns=['biases']) + + def testExcludeWeights(self): + self._test_filter_variables( + expected_var_names=[ + 'conv1/biases', + 'conv2/biases', + 'clfs/biases'], + exclude_patterns=['weights']) + + def testExcludeWeightsAndConv1(self): + self._test_filter_variables( + expected_var_names=[ + 'conv2/biases', + 'clfs/biases'], + exclude_patterns=['weights', 'conv1']) + + def testTwoIncludePatternsEnsureNoVariablesTwiceInFilteredList(self): + self._test_filter_variables( + expected_var_names=[ 
+ 'conv1/weights', + 'conv1/biases', + 'conv2/weights', + 'clfs/weights'], + include_patterns=['conv1', 'weights']) + + def testIncludeConv1ExcludeBiases(self): + self._test_filter_variables( + expected_var_names=[ + 'conv1/weights'], + include_patterns=['conv1'], + exclude_patterns=['biases']) + + def testRegMatchIncludeBiases(self): + self._test_filter_variables( + expected_var_names=[ + 'conv1/biases', + 'conv2/biases', + 'clfs/biases'], + include_patterns=['.*biases'], + reg_search=False) + + def testRegMatchIncludeBiasesWithIncompleteRegExpHasNoMatches(self): + self._test_filter_variables( + expected_var_names=[], + include_patterns=['biases'], + reg_search=False) + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD new file mode 100644 index 00000000000000..777d66f6843ca9 --- /dev/null +++ b/tensorflow/contrib/input_pipeline/BUILD @@ -0,0 +1,106 @@ +# Description: +# Contains ops to build an input pipeline for tensorflow. +# APIs here are meant to evolve over time. + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package(default_visibility = ["//visibility:public"]) + +load( + "//tensorflow:tensorflow.bzl", + "tf_custom_op_library", + "tf_cc_tests", + "tf_gen_op_libs", + "tf_gen_op_wrapper_py", + "tf_kernel_library", +) +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_kernel_tests_linkstatic", +) + +tf_custom_op_library( + # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it. + name = "python/ops/_input_pipeline_ops.so", + srcs = [ + "ops/input_pipeline_ops.cc", + ], + deps = [ + "//tensorflow/contrib/input_pipeline/kernels:input_pipeline_kernels", + ], +) + +tf_gen_op_libs( + op_lib_names = ["input_pipeline_ops"], +) + +tf_gen_op_wrapper_py( + name = "input_pipeline_ops", + deps = [":input_pipeline_ops_op_lib"], +) + +tf_kernel_library( + name = "input_pipeline_ops_kernels", + deps = [ + "//tensorflow/contrib/input_pipeline/kernels:input_pipeline_kernels", + "//tensorflow/core:framework", + ], + alwayslink = 1, +) + +py_library( + name = "input_pipeline_py", + srcs = glob(["python/ops/*.py"]), + data = [":python/ops/_input_pipeline_ops.so"], + srcs_version = "PY2AND3", + deps = [":input_pipeline_ops"], +) + +py_test( + name = "input_pipeline_ops_test", + size = "small", + srcs = ["python/kernel_tests/input_pipeline_ops_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":input_pipeline_py", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +tf_cc_tests( + size = "small", + srcs = [ + "ops/input_pipeline_ops_test.cc", + ], + linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":input_pipeline_ops_op_lib", + "//tensorflow/cc:cc_ops", + "//tensorflow/core", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:ops", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), +) diff --git a/tensorflow/contrib/deprecated/summaries.py b/tensorflow/contrib/input_pipeline/__init__.py similarity index 55% rename from tensorflow/contrib/deprecated/summaries.py rename to tensorflow/contrib/input_pipeline/__init__.py index 
d11c6f592b8b7d..d1219883c95c5c 100644 --- a/tensorflow/contrib/deprecated/summaries.py +++ b/tensorflow/contrib/input_pipeline/__init__.py @@ -12,20 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Deprecated endpoints that we aren't yet ready to remove entirely. +"""Ops and modules related to input_pipeline. + +@@obtain_next + """ + from __future__ import absolute_import from __future__ import division from __future__ import print_function - -# When the endpoints are removed from core TensorFlow, the old implementations -# will move to this file. -# pylint: disable=unused-import,line-too-long -from tensorflow.python.ops.logging_ops import audio_summary -from tensorflow.python.ops.logging_ops import histogram_summary -from tensorflow.python.ops.logging_ops import image_summary -from tensorflow.python.ops.logging_ops import merge_all_summaries -from tensorflow.python.ops.logging_ops import merge_summary -from tensorflow.python.ops.logging_ops import scalar_summary -# pylint: enable=unused-import,line-too-long +from tensorflow.contrib.input_pipeline.python.ops.input_pipeline_ops import obtain_next diff --git a/tensorflow/contrib/metrics/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD similarity index 74% rename from tensorflow/contrib/metrics/kernels/BUILD rename to tensorflow/contrib/input_pipeline/kernels/BUILD index 967c98ad60e34b..e22e16eef2fd60 100644 --- a/tensorflow/contrib/metrics/kernels/BUILD +++ b/tensorflow/contrib/input_pipeline/kernels/BUILD @@ -1,5 +1,5 @@ # Description: -# Contains kernels for evaluation metrics and summary statistics. +# Contains kernels for the input pipeline. licenses(["notice"]) # Apache 2.0 @@ -8,9 +8,8 @@ exports_files(["LICENSE"]) package(default_visibility = ["//tensorflow:__subpackages__"]) cc_library( - name = "set_kernels", - srcs = ["set_kernels.cc"], - copts = ["-Wno-sign-compare"], + name = "input_pipeline_kernels", + srcs = ["input_pipeline_kernels.cc"], deps = [ "//tensorflow/core:framework_headers_lib", "//third_party/eigen3", diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc new file mode 100644 index 00000000000000..3f46dabaaf27b4 --- /dev/null +++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc @@ -0,0 +1,59 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// This Op takes in a list of strings and a counter (ref). It increments the +// counter by 1 and returns the element at that position in the list (circling +// around if need to). 
+class ObtainNextOp : public OpKernel { + public: + explicit ObtainNextOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* list; + OP_REQUIRES_OK(ctx, ctx->input("list", &list)); + int64 num_elements = list->NumElements(); + auto list_flat = list->flat<string>(); + + // Allocate output. + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({1}), + &output_tensor)); + + // Obtain mutex for the "counter" tensor. + mutex* mu; + OP_REQUIRES_OK(ctx, ctx->input_ref_mutex("counter", &mu)); + mutex_lock l(*mu); + // Increment "counter" tensor by 1. + Tensor counter_tensor; + OP_REQUIRES_OK(ctx, ctx->mutable_input("counter", &counter_tensor, true)); + auto counter_tensor_flat = counter_tensor.flat<int64>(); + int64& pos = counter_tensor_flat(0); + pos = (pos + 1) % num_elements; + + // Assign value to output. + auto output_tensor_flat = output_tensor->flat<string>(); + output_tensor_flat(0) = list_flat(pos); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ObtainNext").Device(DEVICE_CPU), ObtainNextOp); + +} // namespace tensorflow diff --git a/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops.cc b/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops.cc new file mode 100644 index 00000000000000..05639ed7029ad8 --- /dev/null +++ b/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops.cc @@ -0,0 +1,48 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("ObtainNext") + .Input("list: string") + .Input("counter: Ref(int64)") + .Output("out_element: string") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused_input, input1; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused_input)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &input1)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input1, 0), 1, &unused_dim)); + c->set_output(0, c->Vector(1)); + return Status::OK(); + }) + .Doc(R"doc( +Takes a list and returns the next based on a counter in a round-robin fashion. + +Returns the element in the list at the new position of the counter, so if you +want to circle the list around start by setting the counter value = -1. + +list: A list of strings +counter: A reference to an int64 variable +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops_test.cc b/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops_test.cc new file mode 100644 index 00000000000000..b644c23c7f4770 --- /dev/null +++ b/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops_test.cc @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(InputPipelineOpsTest, ObtainNext_InvalidNumberOfInputs) { + ShapeInferenceTestOp op("ObtainNext"); + op.input_tensors.resize(3); + INFER_ERROR("Wrong number of inputs passed", op, "?;?;?"); +} + +TEST(InputPipelineOpsTest, ObtainNext) { + ShapeInferenceTestOp op("ObtainNext"); + INFER_OK(op, "[100];[1]", "[1]"); + + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[1,1];[1]"); + INFER_ERROR("Dimension must be 1 but is 2", op, "[1000];[2]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/contrib/input_pipeline/python/kernel_tests/input_pipeline_ops_test.py b/tensorflow/contrib/input_pipeline/python/kernel_tests/input_pipeline_ops_test.py new file mode 100644 index 00000000000000..b8f1d9c0e4bbdc --- /dev/null +++ b/tensorflow/contrib/input_pipeline/python/kernel_tests/input_pipeline_ops_test.py @@ -0,0 +1,56 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for input_pipeline_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.contrib.input_pipeline.python.ops import input_pipeline_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variables as var_ops + + +class InputPipelineOpsTest(tf.test.TestCase): + + def testObtainNext(self): + with self.test_session(): + var = state_ops.variable_op([1], tf.int64) + tf.assign(var, [-1]).op.run() + c = tf.constant(["a", "b"]) + sample1 = input_pipeline_ops.obtain_next(c, var) + self.assertEqual(b"a", sample1.eval()) + self.assertEqual([0], var.eval()) + sample2 = input_pipeline_ops.obtain_next(c, var) + self.assertEqual(b"b", sample2.eval()) + self.assertEqual([1], var.eval()) + sample3 = input_pipeline_ops.obtain_next(c, var) + self.assertEqual(b"a", sample3.eval()) + self.assertEqual([0], var.eval()) + + def testSeekNext(self): + string_list = ["a", "b", "c"] + with self.test_session() as session: + elem = input_pipeline_ops.seek_next(string_list) + session.run(tf.initialize_all_variables()) + self.assertEqual(b"a", session.run(elem)) + self.assertEqual(b"b", session.run(elem)) + self.assertEqual(b"c", session.run(elem)) + self.assertEqual(b"a", session.run(elem)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/input_pipeline/python/ops/input_pipeline_ops.py b/tensorflow/contrib/input_pipeline/python/ops/input_pipeline_ops.py new file mode 100644 index 00000000000000..d8c95d9c393232 --- /dev/null +++ b/tensorflow/contrib/input_pipeline/python/ops/input_pipeline_ops.py @@ -0,0 +1,69 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python wrapper for input_pipeline_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.util import loader +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.platform import resource_loader + + +_input_pipeline_ops = loader.load_op_library( + resource_loader.get_path_to_datafile("_input_pipeline_ops.so")) + + +def obtain_next(string_list_tensor, counter): + """Basic wrapper for the ObtainNextOp. + + Args: + string_list_tensor: A tensor that is a list of strings + counter: an int64 ref tensor to keep track of which element is returned. + + Returns: + An op that produces the element at counter + 1 in the list, round + robin style. 
+ """ + return _input_pipeline_ops.obtain_next(string_list_tensor, counter) + + +def seek_next(string_list): + """Returns an op that seeks the next element in a list of strings. + + Seeking happens in a round robin fashion. This op creates a variable called + counter that is initialized to -1 and is used to keep track of which element + in the list was returned. + + Args: + string_list: A list of strings + + Returns: + An op that produces the next element in the provided list. + """ + with variable_scope.variable_scope("obtain_next"): + counter = variable_scope.get_variable( + name="obtain_next_counter", + initializer=constant_op.constant([-1], dtype=dtypes.int64), + dtype=dtypes.int64) + with ops.device(counter.device): + string_tensor = constant_op.constant(string_list, + name="obtain_next_string_list") + return obtain_next(string_tensor, counter) + diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index b47638d81be7d8..2c0a75666c3100 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -87,7 +87,7 @@ py_library( cuda_py_test( name = "layers_test", - size = "small", + size = "medium", srcs = ["python/layers/layers_test.py"], additional_deps = [ ":layers_py", diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py index 25a871cd150661..95a2626c2c91ba 100644 --- a/tensorflow/contrib/layers/python/layers/embedding_ops.py +++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py @@ -109,8 +109,8 @@ def safe_embedding_lookup_sparse(embedding_weights, embedding_weights + [sparse_ids, sparse_weights]) as scope: # Reshape higher-rank sparse ids and weights to linear segment ids. - original_shape = sparse_ids.shape - original_rank_dim = sparse_ids.shape.get_shape()[0] + original_shape = sparse_ids.dense_shape + original_rank_dim = sparse_ids.dense_shape.get_shape()[0] original_rank = ( array_ops.size(original_shape) if original_rank_dim.value is None @@ -122,7 +122,7 @@ def safe_embedding_lookup_sparse(embedding_weights, if sparse_weights is not None: sparse_weights = sparse_tensor.SparseTensor( sparse_ids.indices, - sparse_weights.values, sparse_ids.shape) + sparse_weights.values, sparse_ids.dense_shape) # Prune invalid ids and weights. 
sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights) diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py index 60b61ed92f4d53..a150446e840647 100644 --- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py @@ -338,7 +338,7 @@ def test_scattered_embedding_lookup_sparse(self): embedding_weights = self._random_weights(num_shards=3) sparse_tensor = tf.SparseTensor(values=["foo", "bar", "foo", "bar"], indices=[[0, 0], [1, 0], [1, 1], [3, 0]], - shape=[5, 2]) + dense_shape=[5, 2]) embedding_lookup_result = ( tf.contrib.layers.scattered_embedding_lookup_sparse( diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index e3ef7328a4d082..b94024bd6b35fb 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -456,7 +456,7 @@ def insert_transformed_feature(self, columns_to_tensors): sparse_id_values = math_ops.mod(input_tensor.values, self.bucket_size, name="mod") columns_to_tensors[self] = sparse_tensor_py.SparseTensor( - input_tensor.indices, sparse_id_values, input_tensor.shape) + input_tensor.indices, sparse_id_values, input_tensor.dense_shape) def sparse_column_with_integerized_feature(column_name, @@ -530,7 +530,7 @@ def insert_transformed_feature(self, columns_to_tensors): sparse_id_values = string_ops.string_to_hash_bucket_fast( sparse_values, self.bucket_size, name="lookup") columns_to_tensors[self] = sparse_tensor_py.SparseTensor( - input_tensor.indices, sparse_id_values, input_tensor.shape) + input_tensor.indices, sparse_id_values, input_tensor.dense_shape) def sparse_column_with_hash_bucket(column_name, @@ -718,7 +718,7 @@ def weighted_sparse_column(sparse_id_column, is a SparseTensor. Following are assumed to be true: * sparse_tensor.indices = weights_tensor.indices - * sparse_tensor.shape = weights_tensor.shape + * sparse_tensor.dense_shape = weights_tensor.dense_shape Args: sparse_id_column: A `_SparseColumn` which is created by diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py index 2c8e0315ba94f0..f7ca285297bf19 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py @@ -371,7 +371,7 @@ def _create_joint_embedding_lookup(columns_to_tensors, sparse_tensors.append( sparse_tensor_py.SparseTensor(t.indices, values, - t.shape)) + t.dense_shape)) sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors) with variable_scope.variable_scope( None, default_name='linear_weights', values=columns_to_tensors.values()): diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py index 8a49e14c08a3de..84b4c9c0b99784 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py @@ -78,7 +78,7 @@ def testSparseColumnWithHashBucket(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} # Test transform features. 
output = tf.contrib.layers.transform_features( @@ -100,7 +100,7 @@ def testSparseIntColumnWithHashBucket(self): "wire", 10, dtype=tf.int64) wire_tensor = tf.SparseTensor(values=[101, 201, 301], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} # Test transform features. output = tf.contrib.layers.transform_features( @@ -135,7 +135,7 @@ def testEmbeddingColumn(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} output = feature_column_ops._Transformer(features).transform( tf.contrib.layers.embedding_column(hashed_sparse, 10)) @@ -157,7 +157,7 @@ def testSparseColumnWithKeys(self): "wire", ["marlo", "omar", "stringer"]) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} # Test transform features. output = tf.contrib.layers.transform_features( @@ -196,7 +196,7 @@ def testSparseColumnWithHashBucket_IsIntegerized(self): "wire", 10) wire_tensor = tf.SparseTensor(values=[100, 1, 25], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} # Test transform features. output = tf.contrib.layers.transform_features( @@ -235,11 +235,11 @@ def testWeightedSparseColumn(self): "ids", ["marlo", "omar", "stringer"]) ids_tensor = tf.SparseTensor(values=["stringer", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights") weights_tensor = tf.SparseTensor(values=[10.0, 20.0, 30.0], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"ids": ids_tensor, "weights": weights_tensor} # Test transform features. @@ -273,10 +273,10 @@ def testCrossColumn(self): features = { "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [1, 0]], - shape=[2, 1]) + dense_shape=[2, 1]) } # Test transform features. output = tf.contrib.layers.transform_features( @@ -301,7 +301,7 @@ def testCrossWithBucketizedColumn(self): "price": tf.constant([[20.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } # Test transform features. output = tf.contrib.layers.transform_features( @@ -326,7 +326,7 @@ def testCrossWithMultiDimensionBucketizedColumn(self): features = {"price": tf.constant([[20., 210.], [110., 50.], [-3., -30.]]), "country": tf.SparseTensor(values=["US", "SV", "US"], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 2])} + dense_shape=[3, 2])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, [country_price], @@ -359,10 +359,10 @@ def testCrossWithCrossedColumn(self): "price": tf.constant([[20.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]), + dense_shape=[1, 2]), "wire": tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [0, 1], [0, 2]], - shape=[1, 3]) + dense_shape=[1, 3]) } # Test transform features. 
output = tf.contrib.layers.transform_features( @@ -509,7 +509,8 @@ def testOneHotColumnFromSparseColumnWithKeysSucceedsForDNN(self): ids_column = tf.contrib.layers.sparse_column_with_keys( "ids", ["a", "b", "c", "unseen"]) ids_tensor = tf.SparseTensor( - values=["c", "b", "a"], indices=[[0, 0], [1, 0], [2, 0]], shape=[3, 1]) + values=["c", "b", "a"], indices=[[0, 0], [1, 0], [2, 0]], + dense_shape=[3, 1]) one_hot_sparse = tf.contrib.layers.one_hot_column(ids_column) features = {"ids": ids_tensor} output = tf.contrib.layers.input_from_feature_columns(features, @@ -586,7 +587,7 @@ def testEmbeddingColumnSucceedsForDNN(self): def testScatteredEmbeddingColumnSucceedsForDNN(self): wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo", "omar"], indices=[[0, 0], [1, 0], [1, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) features = {"wire": wire_tensor} # Big enough hash space so that hopefully there is no collision @@ -610,7 +611,7 @@ def testEmbeddingColumnWithInitializerSucceedsForDNN(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} init_value = 133.7 embeded_sparse = tf.contrib.layers.embedding_column( @@ -629,7 +630,7 @@ def testEmbeddingColumnWithMultipleInitializersFails(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} embedded_sparse = tf.contrib.layers.embedding_column( hashed_sparse, @@ -656,11 +657,11 @@ def testEmbeddingColumnWithWeightedSparseColumnSucceedsForDNN(self): "ids", ["marlo", "omar", "stringer"]) ids_tensor = tf.SparseTensor(values=["stringer", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights") weights_tensor = tf.SparseTensor(values=[10.0, 20.0, 30.0], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"ids": ids_tensor, "weights": weights_tensor} embeded_sparse = tf.contrib.layers.embedding_column(weighted_ids, 10) @@ -680,7 +681,7 @@ def testEmbeddingColumnWithCrossedColumnSucceedsForDNN(self): set([a, b]), hash_bucket_size=10000) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"aaa": wire_tensor, "bbb": wire_tensor} embeded_sparse = tf.contrib.layers.embedding_column(crossed, 10) output = tf.contrib.layers.input_from_feature_columns(features, @@ -693,7 +694,7 @@ def testSparseColumnFailsForDNN(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} with self.test_session(): with self.assertRaisesRegexp( @@ -706,11 +707,11 @@ def testWeightedSparseColumnFailsForDNN(self): "ids", ["marlo", "omar", "stringer"]) ids_tensor = tf.SparseTensor(values=["stringer", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights") weights_tensor = tf.SparseTensor(values=[10.0, 20.0, 30.0], indices=[[0, 0], [1, 0], 
[1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"ids": ids_tensor, "weights": weights_tensor} with self.test_session(): @@ -729,7 +730,7 @@ def testCrossedColumnFailsForDNN(self): set([a, b]), hash_bucket_size=10000) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"aaa": wire_tensor, "bbb": wire_tensor} with self.test_session(): with self.assertRaisesRegexp( @@ -748,7 +749,7 @@ def testDeepColumnsSucceedForDNN(self): "price": tf.constant([[20., 200], [110, 2], [-20, -30]]), "wire": tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) } embeded_sparse = tf.contrib.layers.embedding_column( hashed_sparse, @@ -764,7 +765,7 @@ def testEmbeddingColumnForDNN(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[3, 2]) + dense_shape=[3, 2]) features = {"wire": wire_tensor} embeded_sparse = tf.contrib.layers.embedding_column( hashed_sparse, @@ -782,7 +783,7 @@ def testEmbeddingColumnWithMaxNormForDNN(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[3, 2]) + dense_shape=[3, 2]) features = {"wire": wire_tensor} embedded_sparse = tf.contrib.layers.embedding_column( hashed_sparse, @@ -802,11 +803,11 @@ def testEmbeddingColumnWithWeightedSparseColumnForDNN(self): "ids", ["marlo", "omar", "stringer"]) ids_tensor = tf.SparseTensor(values=["stringer", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[3, 2]) + dense_shape=[3, 2]) weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights") weights_tensor = tf.SparseTensor(values=[10.0, 20.0, 30.0], indices=[[0, 0], [1, 0], [1, 1]], - shape=[3, 2]) + dense_shape=[3, 2]) features = {"ids": ids_tensor, "weights": weights_tensor} embeded_sparse = tf.contrib.layers.embedding_column( @@ -831,7 +832,7 @@ def testInputLayerWithCollectionsForDNN(self): "price": tf.constant([[20.], [110], [-3]]), "wire": tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) } embeded_sparse = tf.contrib.layers.embedding_column(hashed_sparse, 10) tf.contrib.layers.input_from_feature_columns( @@ -850,7 +851,7 @@ def testInputLayerWithTrainableArgForDNN(self): "price": tf.constant([[20.], [110], [-3]]), "wire": tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) } embeded_sparse = tf.contrib.layers.embedding_column(hashed_sparse, 10) tf.contrib.layers.input_from_feature_columns( @@ -1178,7 +1179,7 @@ def testSparseColumn(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns( features, [hashed_sparse], num_outputs=5) @@ -1192,7 +1193,7 @@ def testSparseIntColumn(self): "wire", 10, dtype=tf.int64) wire_tensor = tf.SparseTensor(values=[101, 201, 301], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} logits, _, _ = 
tf.contrib.layers.weighted_sum_from_feature_columns( features, [hashed_sparse], num_outputs=5) @@ -1207,7 +1208,7 @@ def testSparseColumnWithDenseInputTensor(self): logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns( features, [hashed_sparse], num_outputs=5) with self.test_session(): - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() self.assertAllEqual(logits.eval().shape, [2, 5]) def testWeightedSparseColumn(self): @@ -1215,11 +1216,11 @@ def testWeightedSparseColumn(self): "ids", ["marlo", "omar", "stringer"]) ids_tensor = tf.SparseTensor(values=["stringer", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights") weights_tensor = tf.SparseTensor(values=[10.0, 20.0, 30.0], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"ids": ids_tensor, "weights": weights_tensor} logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1242,7 +1243,7 @@ def testWeightedSparseColumnWithDenseInputTensor(self): features, [weighted_ids], num_outputs=5) with self.test_session(): - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() tf.initialize_all_tables().run() self.assertAllEqual(logits.eval().shape, [2, 5]) @@ -1255,7 +1256,7 @@ def testCrossedColumn(self): set([a, b]), hash_bucket_size=10000) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"aaa": wire_tensor, "bbb": wire_tensor} logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns( features, [crossed], num_outputs=5) @@ -1267,7 +1268,7 @@ def testEmbeddingColumn(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [1, 1]], - shape=[2, 2]) + dense_shape=[2, 2]) features = {"wire": wire_tensor} embeded_sparse = tf.contrib.layers.embedding_column(hashed_sparse, 10) with self.test_session(): @@ -1310,7 +1311,7 @@ def testAllWideColumns(self): "price": tf.constant([[20.], [110], [-3]]), "wire": tf.SparseTensor(values=["omar", "stringer", "marlo"], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) } output, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns( features, [real_valued, bucket, hashed_sparse, crossed], @@ -1329,7 +1330,7 @@ def testPredictions(self): "age": tf.constant([[1], [2]]), "language": tf.SparseTensor(values=["hindi", "english"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), } output, column_to_variable, bias = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, @@ -1363,10 +1364,10 @@ def testJointPredictions(self): features = { "country": tf.SparseTensor(values=["finland", "us"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), "language": tf.SparseTensor(values=["hindi", "english"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), } output, variables, bias = ( tf.contrib.layers.joint_weighted_sum_from_feature_columns( @@ -1400,7 +1401,7 @@ def testJointPredictionsWeightedFails(self): "weight": tf.constant([[1], [2]]), "language": tf.SparseTensor(values=["hindi", "english"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), } with self.assertRaises(AssertionError): tf.contrib.layers.joint_weighted_sum_from_feature_columns( @@ -1427,10 +1428,10 @@ def 
testPredictionsWithWeightedSparseColumn(self): features = { "language": tf.SparseTensor(values=["hindi", "english"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), "age": tf.SparseTensor(values=[10.0, 20.0], indices=[[0, 0], [1, 0]], - shape=[2, 1]) + dense_shape=[2, 1]) } output, column_to_variable, bias = ( tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1457,7 +1458,7 @@ def testPredictionsWithMultivalentColumnButNoCross(self): features = { "language": tf.SparseTensor(values=["hindi", "english"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } output, column_to_variable, bias = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, @@ -1508,10 +1509,10 @@ def testCrossUsageInPredictions(self): features = { "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [1, 0]], - shape=[2, 1]) + dense_shape=[2, 1]) } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1534,7 +1535,7 @@ def testCrossColumnByItself(self): features = { "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [0, 1]], - shape=[1, 2]), + dense_shape=[1, 2]), } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1561,10 +1562,10 @@ def testMultivalentCrossUsageInPredictions(self): features = { "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [0, 1]], - shape=[1, 2]), + dense_shape=[1, 2]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1592,10 +1593,10 @@ def testMultivalentCrossUsageInPredictionsWithPartition(self): features = { "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [0, 1]], - shape=[1, 2]), + dense_shape=[1, 2]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } with tf.variable_scope( "weighted_sum_from_feature_columns", @@ -1634,7 +1635,7 @@ def testRealValuedColumnHavingMultiDimensions(self): "incomes": tf.constant([[100., 200., 300.], [10., 20., 30.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [1, 0]], - shape=[2, 2])} + dense_shape=[2, 2])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( features, [country, age, incomes], @@ -1658,7 +1659,7 @@ def testMulticlassWithRealValuedColumnHavingMultiDimensions(self): "incomes": tf.constant([[100., 200., 300.], [10., 20., 30.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [1, 0]], - shape=[2, 2])} + dense_shape=[2, 2])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( features, [country, age, incomes], @@ -1704,7 +1705,7 @@ def testBucketizedColumnHavingMultiDimensions(self): features = {"price": tf.constant([[20., 210], [110, 50], [-3, -30]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [1, 0]], - shape=[3, 2])} + dense_shape=[3, 2])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, [bucket, country], @@ -1729,7 +1730,7 @@ def testMulticlassWithBucketizedColumnHavingMultiDimensions(self): features = {"price": tf.constant([[20., 210], [110, 50], [-3, -30]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 
0], [1, 0]], - shape=[3, 2])} + dense_shape=[3, 2])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, [bucket, country], @@ -1763,7 +1764,7 @@ def testCrossWithBucketizedColumn(self): "price": tf.constant([[20.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, @@ -1797,10 +1798,10 @@ def testCrossWithCrossedColumn(self): "price": tf.constant([[20.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]), + dense_shape=[1, 2]), "language": tf.SparseTensor(values=["english", "spanish"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns( @@ -1822,7 +1823,7 @@ def testIntegerizedColumn(self): with tf.Graph().as_default(): features = {"product": tf.SparseTensor(values=[0, 4, 2], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1])} + dense_shape=[3, 1])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, [product], @@ -1844,7 +1845,7 @@ def testIntegerizedColumnWithDenseInputTensor(self): [product], num_outputs=1)) with self.test_session() as sess: - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() tf.initialize_all_tables().run() product_weights = column_to_variable[product][0] sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]])) @@ -1860,7 +1861,7 @@ def testIntegerizedColumnWithDenseInputTensor2(self): [product], num_outputs=1)) with self.test_session() as sess: - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() tf.initialize_all_tables().run() product_weights = column_to_variable[product][0] sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]])) @@ -1872,7 +1873,7 @@ def testIntegerizedColumnWithInvalidId(self): with tf.Graph().as_default(): features = {"product": tf.SparseTensor(values=[5, 4, 7], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1])} + dense_shape=[3, 1])} output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, [product], @@ -1978,7 +1979,7 @@ def testMulticlassWithCrossedColumn(self): shape=[4, 1]), "country": tf.SparseTensor(values=["US", "SV", "RU", "KE"], indices=[[0, 0], [1, 0], [2, 0], [3, 0]], - shape=[4, 1]) + dense_shape=[4, 1]) } output, column_to_variable, _ = ( tf.contrib.layers.weighted_sum_from_feature_columns(features, @@ -2035,7 +2036,7 @@ def testVariablesAddedToCollection(self): "price": tf.constant([[20.]]), "country": tf.SparseTensor(values=["US", "SV"], indices=[[0, 0], [0, 1]], - shape=[1, 2]) + dense_shape=[1, 2]) } tf.contrib.layers.weighted_sum_from_feature_columns( features, [country_price, price_bucket], @@ -2193,7 +2194,7 @@ def testNotGoodDtype(self): def testSparseTensor(self): with self.assertRaises(ValueError): tf.contrib.layers.infer_real_valued_columns( - tf.SparseTensor(indices=[[0, 0]], values=["a"], shape=[1, 1])) + tf.SparseTensor(indices=[[0, 0]], values=["a"], dense_shape=[1, 1])) if __name__ == "__main__": diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index cb185182f13a46..4565c4b032f58e 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py 
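The test hunks above, and the ones that follow, are a mechanical rename of the `SparseTensor` constructor keyword from `shape` to `dense_shape`. A minimal sketch of a call site after the rename (the values, indices, and shape below are illustrative):

```python
import tensorflow as tf

# Only the keyword changes; the indices/values semantics are untouched.
wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"],
                              indices=[[0, 0], [1, 0], [1, 1]],
                              dense_shape=[2, 2])
```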
@@ -41,7 +41,7 @@ def _sparse_id_tensor(shape, vocab_size, seed=112123): indices = indices[keep] values = values[keep] - return tf.SparseTensor(indices=indices, values=values, shape=shape) + return tf.SparseTensor(indices=indices, values=values, dense_shape=shape) class FeatureColumnTest(tf.test.TestCase): @@ -98,10 +98,10 @@ def testSharedEmbeddingColumn(self): # Create a sparse id tensor for a1. input_tensor_c1 = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2]], - values=[0, 1, 2], shape=[3, 3]) + values=[0, 1, 2], dense_shape=[3, 3]) # Create a sparse id tensor for a2. input_tensor_c2 = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2]], - values=[0, 1, 2], shape=[3, 3]) + values=[0, 1, 2], dense_shape=[3, 3]) with tf.variable_scope("run_1"): b1 = tf.contrib.layers.input_from_feature_columns( {b[0]: input_tensor_c1}, [b[0]]) @@ -598,7 +598,7 @@ def testInitEmbeddingColumnWeightsFromCkpt(self): # vocab. input_tensor = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2], [3, 3]], values=[0, 1, 2, 3], - shape=[4, 4]) + dense_shape=[4, 4]) # Invoking 'layers.input_from_feature_columns' will create the embedding # variable. Creating under scope 'run_1' so as to prevent name conflicts @@ -654,7 +654,7 @@ def testInitCrossedColumnWeightsFromCkpt(self): input_tensor = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2], [3, 3]], values=[0, 1, 2, 3], - shape=[4, 4]) + dense_shape=[4, 4]) # Invoking 'weighted_sum_from_feature_columns' will create the crossed # column weights variable. diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 5c6559b826d4fb..4086ccb55c2a86 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -30,7 +30,10 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.layers import convolutional as convolutional_layers from tensorflow.python.layers import core as core_layers +from tensorflow.python.layers import normalization as normalization_layers +from tensorflow.python.layers import pooling as pooling_layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import init_ops @@ -115,19 +118,14 @@ def avg_pool2d(inputs, raise ValueError('data_format has to be either NCHW or NHWC.') with ops.name_scope(scope, 'AvgPool2D', [inputs]) as sc: inputs = ops.convert_to_tensor(inputs) - kernel_h, kernel_w = utils.two_element_tuple(kernel_size) - stride_h, stride_w = utils.two_element_tuple(stride) - if data_format == DATA_FORMAT_NHWC: - ksize = [1, kernel_h, kernel_w, 1] - strides = [1, stride_h, stride_w, 1] - else: - ksize = [1, 1, kernel_h, kernel_w] - strides = [1, 1, stride_h, stride_w] - outputs = nn.avg_pool(inputs, - ksize=ksize, - strides=strides, - padding=padding, - data_format=data_format) + df = ('channels_first' if data_format and data_format.startswith('NC') + else 'channels_last') + layer = pooling_layers.AveragePooling2D(pool_size=kernel_size, + strides=stride, + padding=padding, + data_format=df, + _scope=sc) + outputs = layer.apply(inputs) return utils.collect_named_outputs(outputs_collections, sc, outputs) @@ -471,9 +469,64 @@ def batch_norm( if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC): raise ValueError('data_format has to be either NCHW or NHWC.') - with variable_scope.variable_scope(scope, 'BatchNorm', [inputs], - reuse=reuse) as sc: + 
layer_variable_getter = _build_variable_getter() + with variable_scope.variable_scope( + scope, 'BatchNorm', [inputs], reuse=reuse, + custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) + + # Determine whether we can use the core layer class. + if (batch_weights is None and + updates_collections is ops.GraphKeys.UPDATE_OPS): + # Use the core layer class. + axis = 1 if data_format == DATA_FORMAT_NCHW else -1 + if not param_initializers: + param_initializers = {} + beta_initializer = param_initializers.get('beta', + init_ops.zeros_initializer) + gamma_initializer = param_initializers.get('gamma', + init_ops.ones_initializer()) + moving_mean_initializer = param_initializers.get( + 'moving_mean', init_ops.zeros_initializer) + moving_variance_initializer = param_initializers.get( + 'moving_variance', init_ops.ones_initializer()) + layer = normalization_layers.BatchNormalization( + axis=axis, + momentum=decay, + epsilon=epsilon, + center=center, + scale=scale, + beta_initializer=beta_initializer, + gamma_initializer=gamma_initializer, + moving_mean_initializer=moving_mean_initializer, + moving_variance_initializer=moving_variance_initializer, + trainable=trainable, + name=sc.name, + _scope=sc, + _reuse=reuse) + outputs = layer.apply(inputs, training=is_training) + + # Add variables to collections. + _add_variable_to_collections( + layer.moving_mean, variables_collections, 'moving_mean') + _add_variable_to_collections( + layer.moving_variance, variables_collections, 'moving_variance') + if layer.beta: + _add_variable_to_collections(layer.beta, variables_collections, 'beta') + if layer.gamma: + _add_variable_to_collections(layer.gamma, variables_collections, 'gamma') + + if activation_fn is not None: + outputs = activation_fn(outputs) + return utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, outputs) + + # Not supported by layer class: batch_weights argument, + # and custom updates_collections. In that case, use the legacy BN + # implementation. + # Custom updates collections are not supported because the update logic + # is different in this case, in particular w.r.t. "forced updates" and + # update op reuse.
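For the common configuration handled by the new code path above (no `batch_weights`, default `updates_collections`), callers are unchanged. A rough usage sketch under those assumptions, with illustrative shapes:

```python
import tensorflow as tf

inputs = tf.random_uniform((8, 7, 7, 16), seed=1)  # NHWC, illustrative
net = tf.contrib.layers.batch_norm(
    inputs, decay=0.9, center=True, scale=True, is_training=True,
    updates_collections=tf.GraphKeys.UPDATE_OPS)
# Moving-average updates are still collected in UPDATE_OPS, so training code
# keeps depending on them explicitly.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.no_op(name="train")  # stand-in for the real train op
```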
inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims if inputs_rank is None: @@ -804,69 +857,62 @@ def convolution(inputs, """ if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC']: raise ValueError('Invalid data_format: %r' % (data_format,)) - with variable_scope.variable_scope(scope, 'Conv', [inputs], - reuse=reuse) as sc: + + layer_variable_getter = _build_variable_getter( + {'bias': 'biases', 'kernel': 'weights'}) + + with variable_scope.variable_scope( + scope, 'Conv', [inputs], reuse=reuse, + custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) - dtype = inputs.dtype.base_dtype input_rank = inputs.get_shape().ndims - if input_rank is None: - raise ValueError('Rank of inputs must be known') - if input_rank < 3 or input_rank > 5: - raise ValueError('Rank of inputs is %d, which is not >= 3 and <= 5' % - input_rank) - conv_dims = input_rank - 2 - kernel_size = utils.n_positive_integers(conv_dims, kernel_size) - stride = utils.n_positive_integers(conv_dims, stride) - rate = utils.n_positive_integers(conv_dims, rate) - - if data_format is None or data_format.endswith('C'): - num_input_channels = inputs.get_shape()[input_rank - 1].value - elif data_format.startswith('NC'): - num_input_channels = inputs.get_shape()[1].value + + if input_rank == 3: + layer_class = convolutional_layers.Convolution1D + elif input_rank == 4: + layer_class = convolutional_layers.Convolution2D + elif input_rank == 5: + layer_class = convolutional_layers.Convolution3D else: - raise ValueError('Invalid data_format') + raise ValueError('Convolution not supported for input with rank', + input_rank) - if num_input_channels is None: - raise ValueError('Number of in_channels must be known.') + df = ('channels_first' if data_format and data_format.startswith('NC') + else 'channels_last') + layer = layer_class(filters=num_outputs, + kernel_size=kernel_size, + strides=stride, + padding=padding, + data_format=df, + dilation_rate=rate, + activation=None, + use_bias=not normalizer_fn and biases_initializer, + kernel_initializer=weights_initializer, + bias_initializer=biases_initializer, + kernel_regularizer=weights_regularizer, + bias_regularizer=biases_regularizer, + activity_regularizer=None, + trainable=trainable, + name=sc.name, + dtype=inputs.dtype.base_dtype, + _scope=sc, + _reuse=reuse) + outputs = layer.apply(inputs) + + # Add variables to collections. 
+ _add_variable_to_collections(layer.kernel, variables_collections, 'weights') + if layer.use_bias: + _add_variable_to_collections(layer.bias, variables_collections, 'biases') - weights_shape = ( - list(kernel_size) + [num_input_channels, num_outputs]) - weights_collections = utils.get_variable_collections(variables_collections, - 'weights') - weights = variables.model_variable('weights', - shape=weights_shape, - dtype=dtype, - initializer=weights_initializer, - regularizer=weights_regularizer, - collections=weights_collections, - trainable=trainable) - outputs = nn.convolution(input=inputs, - filter=weights, - dilation_rate=rate, - strides=stride, - padding=padding, - data_format=data_format) if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) - else: - if biases_initializer is not None: - biases_collections = utils.get_variable_collections( - variables_collections, 'biases') - biases = variables.model_variable('biases', - shape=[num_outputs], - dtype=dtype, - initializer=biases_initializer, - regularizer=biases_regularizer, - collections=biases_collections, - trainable=trainable) - outputs = nn.bias_add(outputs, biases, data_format=data_format) + if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs) - convolution2d = convolution @@ -1038,87 +1084,47 @@ def convolution2d_transpose( ValueError: if `data_format` is neither `NHWC` nor `NCHW`. ValueError: if `C` dimension of `inputs` is None. """ + layer_variable_getter = _build_variable_getter( + {'bias': 'biases', 'kernel': 'weights'}) + with variable_scope.variable_scope( - scope, 'Conv2d_transpose', [inputs], reuse=reuse) as sc: + scope, 'Conv2d_transpose', [inputs], reuse=reuse, + custom_getter=layer_variable_getter) as sc: if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC): raise ValueError('data_format has to be either NCHW or NHWC.') - dtype = inputs.dtype.base_dtype - kernel_h, kernel_w = utils.two_element_tuple(kernel_size) - stride_h, stride_w = utils.two_element_tuple(stride) - if data_format == DATA_FORMAT_NCHW: - c_axis, h_axis, w_axis = 1, 2, 3 - else: - h_axis, w_axis, c_axis = 1, 2, 3 - num_filters_in = inputs.get_shape()[c_axis].value - if num_filters_in is None: - raise ValueError('`C` dimension of `inputs` must be known but is None.') - weights_shape = [kernel_h, kernel_w, num_outputs, num_filters_in] - weights_collections = utils.get_variable_collections( - variables_collections, 'weights') - weights = variables.model_variable( - 'weights', - shape=weights_shape, - dtype=dtype, - initializer=weights_initializer, - regularizer=weights_regularizer, - trainable=trainable, - collections=weights_collections) - - inputs_shape = array_ops.shape(inputs) - batch_size = inputs_shape[0] - height, width = inputs_shape[h_axis], inputs_shape[w_axis] - - def get_deconv_dim(dim_size, stride_size, kernel_size, padding): - if isinstance(dim_size, ops.Tensor): - dim_size = math_ops.mul(dim_size, stride_size) - elif dim_size is not None: - dim_size *= stride_size - if padding == 'VALID' and dim_size is not None: - dim_size += max(kernel_size - stride_size, 0) - return dim_size + inputs = ops.convert_to_tensor(inputs) - # Infer the dynamic output shape: - out_height = get_deconv_dim(height, stride_h, kernel_h, padding) - out_width = get_deconv_dim(width, stride_w, kernel_w, padding) + df = ('channels_first' if data_format and data_format.startswith('NC') + else 
'channels_last') + layer = convolutional_layers.Convolution2DTranspose( + filters=num_outputs, + kernel_size=kernel_size, + strides=stride, + padding=padding, + data_format=df, + activation=None, + use_bias=not normalizer_fn and biases_initializer, + kernel_initializer=weights_initializer, + bias_initializer=biases_initializer, + kernel_regularizer=weights_regularizer, + bias_regularizer=biases_regularizer, + activity_regularizer=None, + trainable=trainable, + name=sc.name, + dtype=inputs.dtype.base_dtype, + _scope=sc, + _reuse=reuse) + outputs = layer.apply(inputs) - if data_format == DATA_FORMAT_NHWC: - output_shape = [batch_size, out_height, out_width, num_outputs] - strides = [1, stride_h, stride_w, 1] - else: - output_shape = [batch_size, num_outputs, out_height, out_width] - strides = [1, 1, stride_h, stride_w] - - output_shape = array_ops.pack(output_shape) - outputs = nn.conv2d_transpose(inputs, weights, output_shape, - strides, - padding=padding, - data_format=data_format) - - # Infer the static output shape: - out_shape = inputs.get_shape().as_list() - out_shape[c_axis] = num_outputs - out_shape[h_axis] = get_deconv_dim( - out_shape[h_axis], stride_h, kernel_h, padding) - out_shape[w_axis] = get_deconv_dim( - out_shape[w_axis], stride_w, kernel_w, padding) - outputs.set_shape(out_shape) + # Add variables to collections. + _add_variable_to_collections(layer.kernel, variables_collections, 'weights') + if layer.bias: + _add_variable_to_collections(layer.bias, variables_collections, 'biases') if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) - else: - if biases_initializer is not None: - biases_collections = utils.get_variable_collections( - variables_collections, 'biases') - biases = variables.model_variable('biases', - shape=[num_outputs,], - dtype=dtype, - initializer=biases_initializer, - regularizer=biases_regularizer, - trainable=trainable, - collections=biases_collections) - outputs = nn.bias_add(outputs, biases, data_format=data_format) if activation_fn is not None: outputs = activation_fn(outputs) @@ -1213,7 +1219,7 @@ def _dense_inner_flatten(inputs, new_rank): rank_assertion = check_ops.assert_rank_at_least( inputs, new_rank, message='inputs has rank less than new_rank') with ops.control_dependencies([rank_assertion]): - outer_dimensions = array_ops.slice( + outer_dimensions = array_ops.strided_slice( array_ops.shape(inputs), [0], [new_rank - 1]) new_shape = array_ops.concat(0, (outer_dimensions, [-1])) reshaped = array_ops.reshape(inputs, new_shape) @@ -1272,8 +1278,13 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None): def _model_variable_getter(getter, name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, - partitioner=None, **_): + partitioner=None, rename=None, **_): """Getter that uses model_variable for compatibility with core layers.""" + short_name = name.split('/')[-1] + if rename and short_name in rename: + name_components = name.split('/') + name_components[-1] = rename[short_name] + name = '/'.join(name_components) return variables.model_variable( name, shape=shape, dtype=dtype, initializer=initializer, regularizer=regularizer, collections=collections, trainable=trainable, @@ -1281,6 +1292,18 @@ def _model_variable_getter(getter, name, shape=None, dtype=None, custom_getter=getter) +def _build_variable_getter(rename=None): + """Build a model variable getter that respects scope getter and 
renames.""" + # Respect current getter, if one is set. + current_custom_getter = variable_scope.get_variable_scope().custom_getter + def layer_variable_getter(getter, *args, **kwargs): + if current_custom_getter is not None: + getter = functools.partial(current_custom_getter, getter) + kwargs['rename'] = rename + return _model_variable_getter(getter, *args, **kwargs) + return layer_variable_getter + + def _add_variable_to_collections(variable, collections_set, collections_name): """Adds variable (or all its parts) to all collections with that name.""" collections = utils.get_variable_collections( @@ -1355,11 +1378,13 @@ def fully_connected(inputs, if not (isinstance(num_outputs, six.integer_types)): raise ValueError('num_outputs should be int or long, got %s.', num_outputs) + layer_variable_getter = _build_variable_getter({'bias': 'biases'}) + with variable_scope.variable_scope( scope, 'fully_connected', [inputs], - reuse=reuse, custom_getter=_model_variable_getter) as sc: + reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) - layer = core_layers.FullyConnected( + layer = core_layers.Dense( units=num_outputs, activation=None, use_bias=not normalizer_fn and biases_initializer, @@ -1372,12 +1397,12 @@ def fully_connected(inputs, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, - _reuse_weights=reuse) + _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. _add_variable_to_collections(layer.w, variables_collections, 'weights') - if layer.bias: + if layer.bias is not None: _add_variable_to_collections(layer.bias, variables_collections, 'biases') # Apply normalizer function / layer. @@ -1520,19 +1545,14 @@ def max_pool2d(inputs, raise ValueError('data_format has to be either NCHW or NHWC.') with ops.name_scope(scope, 'MaxPool2D', [inputs]) as sc: inputs = ops.convert_to_tensor(inputs) - kernel_h, kernel_w = utils.two_element_tuple(kernel_size) - stride_h, stride_w = utils.two_element_tuple(stride) - if data_format == DATA_FORMAT_NHWC: - ksize = [1, kernel_h, kernel_w, 1] - strides = [1, stride_h, stride_w, 1] - else: - ksize = [1, 1, kernel_h, kernel_w] - strides = [1, 1, stride_h, stride_w] - outputs = nn.max_pool(inputs, - ksize=ksize, - strides=strides, - padding=padding, - data_format=data_format) + df = ('channels_first' if data_format and data_format.startswith('NC') + else 'channels_last') + layer = pooling_layers.MaxPooling2D(pool_size=kernel_size, + strides=stride, + padding=padding, + data_format=df, + _scope=sc) + outputs = layer.apply(inputs) return utils.collect_named_outputs(outputs_collections, sc, outputs) @@ -1755,62 +1775,91 @@ def separable_convolution2d( Returns: A `Tensor` representing the output of the operation. 
""" + layer_variable_getter = _build_variable_getter( + {'bias': 'biases', + 'depthwise_kernel': 'depthwise_weights', + 'pointwise_kernel': 'pointwise_weights'}) + with variable_scope.variable_scope( - scope, 'SeparableConv2d', [inputs], reuse=reuse) as sc: - dtype = inputs.dtype.base_dtype - kernel_h, kernel_w = utils.two_element_tuple(kernel_size) - stride_h, stride_w = utils.two_element_tuple(stride) - num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4) - weights_collections = utils.get_variable_collections( - variables_collections, 'weights') + scope, 'SeparableConv2d', [inputs], reuse=reuse, + custom_getter=layer_variable_getter) as sc: + inputs = ops.convert_to_tensor(inputs) - depthwise_shape = [kernel_h, kernel_w, - num_filters_in, depth_multiplier] - depthwise_weights = variables.model_variable( - 'depthwise_weights', - shape=depthwise_shape, - dtype=dtype, - initializer=weights_initializer, - regularizer=weights_regularizer, - trainable=trainable, - collections=weights_collections) - strides = [1, stride_h, stride_w, 1] if num_outputs is not None: - # Full separable convolution: Depthwise followed by pointwise convolution. - pointwise_shape = [1, 1, depth_multiplier * num_filters_in, - num_outputs] - pointwise_weights = variables.model_variable( - 'pointwise_weights', - shape=pointwise_shape, + # Apply separable conv using the SeparableConvolution2D layer. + layer = convolutional_layers.SeparableConvolution2D( + filters=num_outputs, + kernel_size=kernel_size, + strides=stride, + padding=padding, + data_format='channels_last', + activation=None, + depth_multiplier=depth_multiplier, + use_bias=not normalizer_fn and biases_initializer, + depthwise_initializer=weights_initializer, + pointwise_initializer=weights_initializer, + bias_initializer=biases_initializer, + depthwise_regularizer=weights_regularizer, + pointwise_regularizer=weights_regularizer, + bias_regularizer=biases_regularizer, + activity_regularizer=None, + trainable=trainable, + name=sc.name, + dtype=inputs.dtype.base_dtype, + _scope=sc, + _reuse=reuse) + outputs = layer.apply(inputs) + + # Add variables to collections. + _add_variable_to_collections(layer.depthwise_kernel, + variables_collections, 'weights') + _add_variable_to_collections(layer.pointwise_kernel, + variables_collections, 'weights') + if layer.bias: + _add_variable_to_collections(layer.bias, + variables_collections, 'biases') + + if normalizer_fn is not None: + normalizer_params = normalizer_params or {} + outputs = normalizer_fn(outputs, **normalizer_params) + else: + # Actually apply depthwise conv instead of separable conv. + dtype = inputs.dtype.base_dtype + kernel_h, kernel_w = utils.two_element_tuple(kernel_size) + stride_h, stride_w = utils.two_element_tuple(stride) + num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4) + weights_collections = utils.get_variable_collections( + variables_collections, 'weights') + + depthwise_shape = [kernel_h, kernel_w, + num_filters_in, depth_multiplier] + depthwise_weights = variables.model_variable( + 'depthwise_weights', + shape=depthwise_shape, dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, trainable=trainable, collections=weights_collections) - outputs = nn.separable_conv2d(inputs, - depthwise_weights, - pointwise_weights, - strides, - padding) - else: - # Depthwise convolution only. 
+ strides = [1, stride_h, stride_w, 1] + outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding) num_outputs = depth_multiplier * num_filters_in - if normalizer_fn is not None: - normalizer_params = normalizer_params or {} - outputs = normalizer_fn(outputs, **normalizer_params) - else: - if biases_initializer is not None: - biases_collections = utils.get_variable_collections( - variables_collections, 'biases') - biases = variables.model_variable('biases', - shape=[num_outputs,], - dtype=dtype, - initializer=biases_initializer, - regularizer=biases_regularizer, - collections=biases_collections) - outputs = nn.bias_add(outputs, biases) + if normalizer_fn is not None: + normalizer_params = normalizer_params or {} + outputs = normalizer_fn(outputs, **normalizer_params) + else: + if biases_initializer is not None: + biases_collections = utils.get_variable_collections( + variables_collections, 'biases') + biases = variables.model_variable('biases', + shape=[num_outputs,], + dtype=dtype, + initializer=biases_initializer, + regularizer=biases_regularizer, + collections=biases_collections) + outputs = nn.bias_add(outputs, biases) if activation_fn is not None: outputs = activation_fn(outputs) @@ -1928,7 +1977,8 @@ def unit_norm(inputs, dim, epsilon=1e-7, scope=None): multiples = [] if dim > 0: multiples.append(array_ops.ones([dim], dtypes.int32)) - multiples.append(array_ops.slice(array_ops.shape(inputs), [dim], [1])) + multiples.append( + array_ops.strided_slice(array_ops.shape(inputs), [dim], [dim + 1])) if dim < (input_rank - 1): multiples.append(array_ops.ones([input_rank - 1 - dim], dtypes.int32)) multiples = array_ops.concat(0, multiples) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 527e6d5670c010..d134403e86f902 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -279,6 +279,18 @@ def testCreateFullyConv(self): biases = tf.contrib.framework.get_variables_by_name('biases')[0] self.assertListEqual(biases.get_shape().as_list(), [64]) + def testFullyConvWithCustomGetter(self): + height, width = 7, 9 + with self.test_session(): + called = [0] + def custom_getter(getter, *args, **kwargs): + called[0] += 1 + return getter(*args, **kwargs) + with tf.variable_scope('test', custom_getter=custom_getter): + images = tf.random_uniform((5, height, width, 32), seed=1) + tf.contrib.layers.convolution2d(images, 64, images.get_shape()[1:3]) + self.assertEqual(called[0], 2) # Custom getter called twice. 
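The new test above exercises the `_build_variable_getter` plumbing: the contrib wrappers now delegate to `tf.layers` classes but still register variables under the legacy contrib names. A small sketch of that expectation (shapes are illustrative):

```python
import tensorflow as tf

images = tf.random_uniform((5, 7, 9, 3), seed=1)
net = tf.contrib.layers.convolution2d(images, 16, [3, 3])
# Variables remain reachable under the contrib names, not kernel/bias.
weights = tf.contrib.framework.get_variables_by_name("weights")[0]
biases = tf.contrib.framework.get_variables_by_name("biases")[0]
```

Regularization-loss op names, however, now reflect the core layer attribute names (`kernel`, `depthwise_kernel`, `pointwise_kernel`), as the updated expectations in the following hunks show.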
+ def testCreateVerticalConv(self): height, width = 7, 9 with self.test_session(): @@ -369,7 +381,7 @@ def testCreateConvWithWD(self): tf.contrib.framework.get_variables_by_name('weights')[0]) wd = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[0] self.assertEqual(wd.op.name, - 'Conv/weights/Regularizer/l2_regularizer') + 'Conv/kernel/Regularizer/l2_regularizer') sess.run(tf.global_variables_initializer()) self.assertAlmostEqual(sess.run(wd), weight_decay * l2_loss.eval()) @@ -1475,6 +1487,19 @@ def testCreateFCWithWD(self): sess.run(tf.global_variables_initializer()) self.assertLess(sess.run(wd), 0.4) + def testCreateFCWithBD(self): + height, width = 3, 3 + with self.test_session() as sess: + inputs = tf.random_uniform((5, height * width * 3), seed=1) + bias_decay = tf.contrib.layers.l2_regularizer(0.01) + tf.contrib.layers.fully_connected(inputs, 32, + biases_regularizer=bias_decay) + wd = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[0] + self.assertEqual(wd.op.name, + 'fully_connected/bias/Regularizer/l2_regularizer') + sess.run(tf.global_variables_initializer()) + self.assertLess(sess.run(wd), 0.4) + def testCreateNoRegularizers(self): height, width = 3, 3 with self.test_session(): @@ -1558,14 +1583,14 @@ def testUnknownChannelsDimNHWC(self): with tf.Graph().as_default() as g, self.test_session(g): inputs = tf.placeholder(dtype=tf.float32) inputs.set_shape(tf.TensorShape((5, 3, 3, None))) - with self.assertRaisesRegexp(ValueError, 'undefined channels dimension'): + with self.assertRaisesRegexp(ValueError, 'undefined'): tf.contrib.layers.batch_norm(inputs, data_format='NHWC') def testUnknownChannelsDimNCHW(self): with tf.Graph().as_default() as g, self.test_session(g): inputs = tf.placeholder(dtype=tf.float32) inputs.set_shape(tf.TensorShape((5, None, 3, 3))) - with self.assertRaisesRegexp(ValueError, 'undefined channels dimension'): + with self.assertRaisesRegexp(ValueError, 'undefined'): tf.contrib.layers.batch_norm(inputs, data_format='NCHW') def testWeightedMomentsFused(self): @@ -2612,13 +2637,13 @@ def testCreateConvWithWeightDecay(self): weight_decay = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[0] self.assertEqual( weight_decay.op.name, - 'SeparableConv2d/depthwise_weights/Regularizer/l2_regularizer') + 'SeparableConv2d/depthwise_kernel/Regularizer/l2_regularizer') sess.run(tf.global_variables_initializer()) self.assertLessEqual(sess.run(weight_decay), 0.05) weight_decay = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[1] self.assertEqual( weight_decay.op.name, - 'SeparableConv2d/pointwise_weights/Regularizer/l2_regularizer') + 'SeparableConv2d/pointwise_kernel/Regularizer/l2_regularizer') self.assertLessEqual(sess.run(weight_decay), 0.05) def testReuseConvWithWeightDecay(self): diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py index 1dfed82103a6a4..ab183ba75d3feb 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers_test.py +++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py @@ -195,7 +195,7 @@ def testAdaptiveGradientClip(self): self.assertAlmostEqual(var_value, 9.8916, 4) self.assertEqual(global_step_value, 1) var_count = 0 - for var in tf.all_variables(): + for var in tf.global_variables(): if var.name.startswith("OptimizeLoss/AdaptiveMaxNorm"): var_count += 1 self.assertEqual(2, var_count) @@ -366,7 +366,7 @@ def testAverages(self): decay=0.5)(grads_and_vars) var_dict = {} - for var in tf.all_variables(): + for var in 
tf.global_variables(): if var.name.startswith("AdaptiveMaxNorm"): var_dict[var.name.split(":")[0]] = var self.assertEqual(2, len(var_dict)) diff --git a/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py b/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py index e4141c6b6d1305..688315fd12e6f3 100644 --- a/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py +++ b/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py @@ -81,7 +81,7 @@ def sparse_feature_cross(inputs, hashed_output=False, num_buckets=0, indices = [sp_input.indices for sp_input in sparse_inputs] values = [sp_input.values for sp_input in sparse_inputs] - shapes = [sp_input.shape for sp_input in sparse_inputs] + shapes = [sp_input.dense_shape for sp_input in sparse_inputs] out_type = dtypes.int64 if hashed_output else dtypes.string internal_type = dtypes.string diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 764971935f2953..d0ba0599d07b5d 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -401,7 +401,7 @@ py_test( py_test( name = "head_test", - size = "small", + size = "medium", srcs = ["python/learn/estimators/head_test.py"], srcs_version = "PY2AND3", deps = [ @@ -638,6 +638,18 @@ py_test( ], ) +py_test( + name = "numpy_io_test", + size = "small", + srcs = ["python/learn/learn_io/numpy_io_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + ], +) + py_test( name = "pandas_io_test", size = "small", diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md index 0dd08cfcccb6cb..6afcfbf3761fda 100644 --- a/tensorflow/contrib/learn/python/learn/README.md +++ b/tensorflow/contrib/learn/python/learn/README.md @@ -108,13 +108,12 @@ import tensorflow.contrib.learn.python.learn as learn iris = datasets.load_iris() def my_model(features, labels): - """DNN with three hidden layers, and dropout of 0.1 probability.""" + """DNN with three hidden layers.""" # Convert the labels to a one-hot tensor of shape (length of features, 3) and # with a on-value of 1 for each one-hot vector of length 3. labels = tf.one_hot(labels, 3, 1, 0) - # Create three fully connected layers respectively of size 10, 20, and 10 with - # each layer having a dropout probability of 0.1. + # Create three fully connected layers respectively of size 10, 20, and 10. features = layers.stack(features, layers.fully_connected, [10, 20, 10]) # Create two tensors respectively for prediction and loss. diff --git a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py index 46e1def77ef368..209a97f2ae0930 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py @@ -69,9 +69,9 @@ def __call__(self): raise errors.OutOfRangeError(None, None, "Already emitted %s epochs." 
% self._epoch) - integer_indexes = [j % self._max - for j in range(self._trav, self._trav + self._batch_size) - ] + integer_indexes = [ + j % self._max for j in range(self._trav, self._trav + self._batch_size) + ] if self._epoch_end in integer_indexes: # after this batch we will have processed self._epoch epochs, possibly @@ -79,8 +79,10 @@ def __call__(self): self._epoch += 1 self._trav = (integer_indexes[-1] + 1) % self._max - return {self._placeholders[0]: integer_indexes, - self._placeholders[1]: self._array[integer_indexes]} + return { + self._placeholders[0]: integer_indexes, + self._placeholders[1]: self._array[integer_indexes] + } class _OrderedDictNumpyFeedFn(object): @@ -99,7 +101,7 @@ def __init__(self, self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._ordered_dict_of_arrays = ordered_dict_of_arrays - self._max = len(ordered_dict_of_arrays.values()[0]) + self._max = len(next(iter(ordered_dict_of_arrays.values()))) for _, v in ordered_dict_of_arrays.items(): if len(v) != self._max: raise ValueError("Array lengths must match.") @@ -115,9 +117,9 @@ def __call__(self): raise errors.OutOfRangeError(None, None, "Already emitted %s epochs." % self._epoch) - integer_indexes = [j % self._max - for j in range(self._trav, self._trav + self._batch_size) - ] + integer_indexes = [ + j % self._max for j in range(self._trav, self._trav + self._batch_size) + ] if self._epoch_end in integer_indexes: # after this batch we will have processed self._epoch epochs, possibly @@ -126,8 +128,10 @@ def __call__(self): self._trav = (integer_indexes[-1] + 1) % self._max feed_dict = {self._index_placeholder: integer_indexes} - cols = [column[integer_indexes] - for column in self._ordered_dict_of_arrays.values()] + cols = [ + column[integer_indexes] + for column in self._ordered_dict_of_arrays.values() + ] feed_dict.update(dict(zip(self._col_placeholders, cols))) return feed_dict @@ -161,9 +165,9 @@ def __call__(self): raise errors.OutOfRangeError(None, None, "Already emitted %s epochs." % self._epoch) - integer_indexes = [j % self._max - for j in range(self._trav, self._trav + self._batch_size) - ] + integer_indexes = [ + j % self._max for j in range(self._trav, self._trav + self._batch_size) + ] if self._epoch_end in integer_indexes: # after this batch we will have processed self._epoch epochs, possibly @@ -172,7 +176,7 @@ def __call__(self): if self._epoch == self._num_epochs: # trim this batch, so as not to overshoot the last epoch. batch_end_inclusive = integer_indexes.index(self._epoch_end) - integer_indexes = integer_indexes[:(batch_end_inclusive+1)] + integer_indexes = integer_indexes[:(batch_end_inclusive + 1)] self._trav = (integer_indexes[-1] + 1) % self._max result = self._dataframe.iloc[integer_indexes] @@ -193,14 +197,14 @@ def enqueue_data(data, num_epochs=None): """Creates a queue filled from a numpy array or pandas `DataFrame`. - Returns a queue filled with the rows of the given array or `DataFrame`. In - the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to - the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor` - contains the row number. + Returns a queue filled with the rows of the given (`OrderedDict` of) array + or `DataFrame`. In the case of a pandas `DataFrame`, the first enqueued + `Tensor` corresponds to the index of the `DataFrame`. For (`OrderedDict` of) + numpy arrays, the first enqueued `Tensor` contains the row number. 
Args: - data: a numpy `ndarray or` pandas `DataFrame` that will be read into the - queue. + data: a numpy `ndarray`, `OrderedDict` of numpy arrays, or pandas + `DataFrame` that will be read into the queue. capacity: the capacity of the queue. shuffle: whether or not to shuffle the rows of the array. min_after_dequeue: minimum number of elements that can remain in the queue @@ -213,10 +217,12 @@ def enqueue_data(data, num_epochs: limit enqueuing to a specified number of epochs, if provided. Returns: - A queue filled with the rows of the given array or `DataFrame`. + A queue filled with the rows of the given (`OrderedDict` of) array or + `DataFrame`. Raises: - TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`. + TypeError: `data` is not a Pandas `DataFrame`, an `OrderedDict` of numpy + arrays or a numpy `ndarray`. """ with ops.name_scope(name): if isinstance(data, np.ndarray): @@ -229,8 +235,9 @@ def enqueue_data(data, queue_shapes = [()] + [col.shape[1:] for col in data.values()] get_feed_fn = _OrderedDictNumpyFeedFn elif HAS_PANDAS and isinstance(data, pd.DataFrame): - types = [dtypes.as_dtype(dt) - for dt in [data.index.dtype] + list(data.dtypes)] + types = [ + dtypes.as_dtype(dt) for dt in [data.index.dtype] + list(data.dtypes) + ] queue_shapes = [() for _ in types] get_feed_fn = _PandasFeedFn else: @@ -264,16 +271,16 @@ def enqueue_data(data, if shuffle: min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else min_after_dequeue) - queue = data_flow_ops.RandomShuffleQueue(capacity, - min_after_dequeue, - dtypes=types, - shapes=queue_shapes, - seed=seed) + queue = data_flow_ops.RandomShuffleQueue( + capacity, + min_after_dequeue, + dtypes=types, + shapes=queue_shapes, + seed=seed) else: min_after_dequeue = 0 # just for the summary text - queue = data_flow_ops.FIFOQueue(capacity, - dtypes=types, - shapes=queue_shapes) + queue = data_flow_ops.FIFOQueue( + capacity, dtypes=types, shapes=queue_shapes) enqueue_ops = [] feed_fns = [] @@ -285,16 +292,17 @@ def enqueue_data(data, enqueue_ops.append(queue.enqueue_many(placeholders)) seed_i = None if seed is None else (i + 1) * seed - feed_fns.append(get_feed_fn(placeholders, - data, - enqueue_size, - random_start=shuffle, - seed=seed_i, - num_epochs=num_epochs)) - - runner = fqr.FeedingQueueRunner(queue=queue, - enqueue_ops=enqueue_ops, - feed_fns=feed_fns) + feed_fns.append( + get_feed_fn( + placeholders, + data, + enqueue_size, + random_start=shuffle, + seed=seed_i, + num_epochs=num_epochs)) + + runner = fqr.FeedingQueueRunner( + queue=queue, enqueue_ops=enqueue_ops, feed_fns=feed_fns) queue_runner.add_queue_runner(runner) full = (math_ops.cast( diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py index 6ce71d882e5c8e..e4a63273ba239b 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py @@ -28,7 +28,7 @@ def _negate_sparse(st): return sparse_tensor.SparseTensor(indices=st.indices, values=-st.values, - shape=st.shape) + dense_shape=st.shape) @series.Series.register_binary_op("__sub__") diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py index cdff6baf83c55b..71978d439449e2 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/base.py +++ b/tensorflow/contrib/learn/python/learn/datasets/base.py @@ -186,8 +186,8 @@ def 
_is_retriable(e): @retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable) -def urlretrieve_with_retry(url, filename): - urllib.request.urlretrieve(url, filename) +def urlretrieve_with_retry(url, filename=None): + return urllib.request.urlretrieve(url, filename) def maybe_download(filename, work_directory, source_url): @@ -205,11 +205,9 @@ def maybe_download(filename, work_directory, source_url): gfile.MakeDirs(work_directory) filepath = os.path.join(work_directory, filename) if not gfile.Exists(filepath): - with tempfile.NamedTemporaryFile() as tmpfile: - temp_file_name = tmpfile.name - urlretrieve_with_retry(source_url, temp_file_name) - gfile.Copy(temp_file_name, filepath) - with gfile.GFile(filepath) as f: - size = f.size() - print('Successfully downloaded', filename, size, 'bytes.') + temp_file_name, _ = urlretrieve_with_retry(source_url) + gfile.Copy(temp_file_name, filepath) + with gfile.GFile(filepath) as f: + size = f.size() + print('Successfully downloaded', filename, size, 'bytes.') return filepath diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py index 0d1a1c52e225bc..6cee05250adeb8 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py @@ -117,7 +117,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -139,10 +139,11 @@ def testJointLinearModel(self): def input_fn(): return { - 'age': tf.SparseTensor(values=['1'], indices=[[0, 0]], shape=[1, 1]), + 'age': tf.SparseTensor( + values=['1'], indices=[[0, 0]], dense_shape=[1, 1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py index 98947cc6d43e04..34070bc6a80893 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py @@ -28,6 +28,7 @@ from tensorflow.contrib.framework.python.ops import variables as contrib_variables from tensorflow.contrib.layers.python.layers import optimizers from tensorflow.contrib.learn.python.learn import evaluable +from tensorflow.contrib.learn.python.learn import metric_spec from tensorflow.contrib.learn.python.learn import monitors as monitor_lib from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import dnn_linear_combined @@ -341,11 +342,13 @@ def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, return self def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, - batch_size=None, steps=None, metrics=None, name=None): + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): """See evaluable.Evaluable. 
Note: Labels must be integer class indices.""" return self._estimator.evaluate( x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, - steps=steps, metrics=metrics, name=name) + steps=steps, metrics=metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -363,9 +366,9 @@ def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True): num_epochs=1 if you are using something like read_batch_features). Returns: - Numpy array of predicted classes (or an iterable of predicted classes if - as_iterable is True). Each predicted class is represented by its class - index (i.e. integer from 0 to n_classes-1). + Numpy array of predicted classes with shape [batch_size] (or an iterable + of predicted classes if as_iterable is True). Each predicted class is + represented by its class index (i.e. integer from 0 to n_classes-1). """ key = prediction_key.PredictionKey.CLASSES preds = self._estimator.predict(x=x, input_fn=input_fn, @@ -392,9 +395,8 @@ def predict_proba( num_epochs=1 if you are using something like read_batch_features). Returns: - Numpy array of predicted probabilities (or an iterable of predicted - probabilities if as_iterable is True). Each predicted class is represented - by its class index (i.e. integer from 0 to n_classes-1). + Numpy array of predicted probabilities with shape [batch_size, n_classes] + (or an iterable of predicted probabilities if as_iterable is True). """ key = prediction_key.PredictionKey.PROBABILITIES preds = self._estimator.predict(x=x, input_fn=input_fn, @@ -508,7 +510,7 @@ def config(self): return self._estimator.config -class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor): +class DNNRegressor(evaluable.Evaluable, trainable.Trainable): """A regressor for TensorFlow DNN models. Example: @@ -574,7 +576,8 @@ def __init__(self, enable_centered_bias=False, config=None, feature_engineering_fn=None, - label_dimension=1): + label_dimension=1, + embedding_lr_multipliers=None): """Initializes a `DNNRegressor` instance. Args: @@ -608,42 +611,147 @@ def __init__(self, returns features and labels which will be fed into the model. label_dimension: Dimension of the label for multilabels. Defaults to 1. + embedding_lr_multipliers: Optional. A dictionary from `EbeddingColumn` to + a `float` multiplier. Multiplier will be used to multiply with + learning rate for the embedding variables. Returns: A `DNNRegressor` estimator. 
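As a hedged illustration of the rewritten `DNNRegressor` (now a thin wrapper around `estimator.Estimator`) and the new `embedding_lr_multipliers` argument, a minimal sketch; the feature names, multiplier value and input data are hypothetical.

import tensorflow as tf

language = tf.contrib.layers.sparse_column_with_hash_bucket("language", 100)
language_emb = tf.contrib.layers.embedding_column(language, dimension=8)
age = tf.contrib.layers.real_valued_column("age")

def input_fn():
  features = {
      "age": tf.constant([[25.], [40.]]),
      "language": tf.SparseTensor(values=["en", "fr"],
                                  indices=[[0, 0], [1, 0]],
                                  dense_shape=[2, 1]),
  }
  labels = tf.constant([[1.2], [3.4]])
  return features, labels

regressor = tf.contrib.learn.DNNRegressor(
    feature_columns=[age, language_emb],
    hidden_units=[32, 16],
    # New in this change: scale the learning rate of the embedding variables.
    embedding_lr_multipliers={language_emb: 0.5})

regressor.fit(input_fn=input_fn, steps=5)
# evaluate() also gained a checkpoint_path argument in this change, so a
# specific checkpoint can be scored instead of the latest one.
scores = regressor.evaluate(input_fn=input_fn, steps=1)
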
""" - super(DNNRegressor, self).__init__( + self._feature_columns = feature_columns + + self._estimator = estimator.Estimator( + model_fn=_dnn_model_fn, model_dir=model_dir, - weight_column_name=weight_column_name, - dnn_feature_columns=feature_columns, - dnn_optimizer=optimizer, - dnn_hidden_units=hidden_units, - dnn_activation_fn=activation_fn, - dnn_dropout=dropout, - gradient_clip_norm=gradient_clip_norm, - enable_centered_bias=enable_centered_bias, config=config, - feature_engineering_fn=feature_engineering_fn, - label_dimension=label_dimension) - self.feature_columns = feature_columns - self.optimizer = optimizer - self.activation_fn = activation_fn - self.dropout = dropout - self.hidden_units = hidden_units - self._feature_columns_inferred = False + params={ + "head": head_lib._regression_head( # pylint: disable=protected-access + label_dimension=label_dimension, + weight_column_name=weight_column_name, + enable_centered_bias=enable_centered_bias), + "hidden_units": hidden_units, + "feature_columns": feature_columns, + "optimizer": optimizer, + "activation_fn": activation_fn, + "dropout": dropout, + "gradient_clip_norm": gradient_clip_norm, + "num_ps_replicas": config.num_ps_replicas if config else 0, + "embedding_lr_multipliers": embedding_lr_multipliers, + }, + feature_engineering_fn=feature_engineering_fn) + + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, + monitors=None, max_steps=None): + """See trainable.Trainable.""" + # TODO(roumposg): Remove when deprecated monitors are removed. + hooks = monitor_lib.replace_monitors_with_hooks(monitors, self) + self._estimator.fit(x=x, + y=y, + input_fn=input_fn, + steps=steps, + batch_size=batch_size, + monitors=hooks, + max_steps=max_steps) + return self + + def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): + """See evaluable.Evaluable.""" + # TODO(zakaria): remove once deprecation is finished (b/31229024) + custom_metrics = {} + if metrics: + for key, metric in six.iteritems(metrics): + if (not isinstance(metric, metric_spec.MetricSpec) and + not isinstance(key, tuple)): + custom_metrics[(key, prediction_key.PredictionKey.SCORES)] = metric + else: + custom_metrics[key] = metric + + return self._estimator.evaluate( + x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, + steps=steps, metrics=custom_metrics, name=name, + checkpoint_path=checkpoint_path) + + @deprecated_arg_values( + estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, + as_iterable=False) + def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True): + """Returns predicted scores for given features. + + Args: + x: features. + input_fn: Input function. If set, x must be None. + batch_size: Override default batch size. + as_iterable: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + + Returns: + Numpy array of predicted scores (or an iterable of predicted scores if + as_iterable is True). If `label_dimension == 1`, the shape of the output + is `[batch_size]`, otherwise the shape is `[batch_size, label_dimension]`. 
+ """ + key = prediction_key.PredictionKey.SCORES + preds = self._estimator.predict(x=x, input_fn=input_fn, + batch_size=batch_size, outputs=[key], + as_iterable=as_iterable) + if as_iterable: + return (pred[key] for pred in preds) + return preds[key] + + def _get_predict_ops(self, features): + """See `Estimator` class.""" + # This method exists to support some models that use the legacy interface. + # pylint: disable=protected-access + return self._estimator._get_predict_ops(features) + + def get_variable_names(self): + """Returns list of all variable names in this model. + + Returns: + List of names. + """ + return self._estimator.get_variable_names() + + def get_variable_value(self, name): + """Returns value of the variable given by name. + + Args: + name: string, name of the tensor. + + Returns: + `Tensor` object. + """ + return self._estimator.get_variable_value(name) + + def export(self, + export_dir, + input_fn=None, + input_feature_key=None, + use_deprecated_input_fn=True, + signature_fn=None, + default_batch_size=1, + exports_to_keep=None): + """See BaseEstimator.export.""" + def default_input_fn(unused_estimator, examples): + return layers.parse_feature_columns_from_examples( + examples, self._feature_columns) + return self._estimator.export( + export_dir=export_dir, + input_fn=input_fn or default_input_fn, + input_feature_key=input_feature_key, + use_deprecated_input_fn=use_deprecated_input_fn, + signature_fn=signature_fn or export.regression_signature_fn, + prediction_key=prediction_key.PredictionKey.SCORES, + default_batch_size=default_batch_size, + exports_to_keep=exports_to_keep) @property - @deprecated("2016-10-30", - "This method will be removed after the deprecation date. " - "To inspect variables, use get_variable_names() and " - "get_variable_value().") - def weights_(self): - return self.dnn_weights_ + def model_dir(self): + return self._estimator.model_dir @property - @deprecated("2016-10-30", - "This method will be removed after the deprecation date. 
" - "To inspect variables, use get_variable_names() and " - "get_variable_value().") - def bias_(self): - return self.dnn_bias_ + def config(self): + return self._estimator.config diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py index 256e07407926b6..edaee5dfac7450 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py @@ -32,6 +32,7 @@ from tensorflow.contrib.layers.python.layers import feature_column_ops from tensorflow.contrib.layers.python.layers import optimizers from tensorflow.contrib.learn.python.learn import evaluable +from tensorflow.contrib.learn.python.learn import metric_spec from tensorflow.contrib.learn.python.learn import monitors as monitor_lib from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import composable_model @@ -248,8 +249,18 @@ def _get_eval_ops(self, features, labels, metrics=None): custom_metrics = {} if metrics: for name, metric in six.iteritems(metrics): - if not isinstance(name, tuple): - # TODO(zakaria): remove once deprecation is finished (b/31229024) + # Apply default_prediction_key + if isinstance(metric, metric_spec.MetricSpec): + if not metric.prediction_key: + custom_metrics[name] = metric_spec.MetricSpec( + metric_fn=metric.metric_fn, + prediction_key=self._default_prediction_key, + label_key=metric.label_key, + weight_key=metric.weight_key) + else: + custom_metrics[name] = metric + # TODO(zakaria): remove once deprecation is finished (b/31229024) + elif not isinstance(name, tuple): custom_metrics[(name, self._default_prediction_key)] = metric else: custom_metrics[name] = metric @@ -752,11 +763,13 @@ def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, return self def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, - batch_size=None, steps=None, metrics=None, name=None): + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): """See evaluable.Evaluable.""" return self._estimator.evaluate( x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, - steps=steps, metrics=metrics, name=name) + steps=steps, metrics=metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -774,9 +787,9 @@ def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True): num_epochs=1 if you are using something like read_batch_features). Returns: - Numpy array of predicted classes (or an iterable of predicted classes if - as_iterable is True). Each predicted class is represented by its class - index (i.e. integer from 0 to n_classes-1). + Numpy array of predicted classes with shape [batch_size] (or an iterable + of predicted classes if as_iterable is True). Each predicted class is + represented by its class index (i.e. integer from 0 to n_classes-1). """ key = prediction_key.PredictionKey.CLASSES preds = self._estimator.predict( @@ -806,9 +819,8 @@ def predict_proba( num_epochs=1 if you are using something like read_batch_features). Returns: - Numpy array of predicted probabilities (or an iterable of predicted - probabilities if as_iterable is True). Each predicted class is represented - by its class index (i.e. integer from 0 to n_classes-1). 
+ Numpy array of predicted probabilities with shape [batch_size, n_classes] + (or an iterable of predicted probabilities if as_iterable is True). """ key = prediction_key.PredictionKey.PROBABILITIES preds = self._estimator.predict( @@ -955,7 +967,7 @@ def config(self): return self._estimator.config -class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator): +class DNNLinearCombinedRegressor(evaluable.Evaluable, trainable.Trainable): """A regressor for TensorFlow Linear and DNN joined training models. Example: @@ -1028,7 +1040,8 @@ def __init__(self, # _joint_linear_weights pylint: disable=invalid-name enable_centered_bias=False, label_dimension=1, config=None, - feature_engineering_fn=None): + feature_engineering_fn=None, + embedding_lr_multipliers=None): """Initializes a DNNLinearCombinedRegressor instance. Args: @@ -1069,62 +1082,160 @@ def __init__(self, # _joint_linear_weights pylint: disable=invalid-name labels which are the output of `input_fn` and returns features and labels which will be fed into the model. + embedding_lr_multipliers: Optional. A dictionary from `EmbeddingColumn` to + a `float` multiplier. Multiplier will be used to multiply with + learning rate for the embedding variables. Raises: ValueError: If both linear_feature_columns and dnn_features_columns are empty at the same time. """ + linear_feature_columns = linear_feature_columns or [] + dnn_feature_columns = dnn_feature_columns or [] + self._feature_columns = linear_feature_columns + dnn_feature_columns + if not self._feature_columns: + raise ValueError("Either linear_feature_columns or dnn_feature_columns " + "must be defined.") + head = head_lib._regression_head( # pylint: disable=protected-access weight_column_name=weight_column_name, label_dimension=label_dimension, enable_centered_bias=enable_centered_bias) - super(DNNLinearCombinedRegressor, self).__init__( + self._estimator = estimator.Estimator( + model_fn=_dnn_linear_combined_model_fn, model_dir=model_dir, - linear_feature_columns=linear_feature_columns, - linear_optimizer=linear_optimizer, - _joint_linear_weights=_joint_linear_weights, - dnn_feature_columns=dnn_feature_columns, - dnn_optimizer=dnn_optimizer, - dnn_hidden_units=dnn_hidden_units, - dnn_activation_fn=dnn_activation_fn, - dnn_dropout=dnn_dropout, - gradient_clip_norm=gradient_clip_norm, - head=head, config=config, - feature_engineering_fn=feature_engineering_fn, - default_prediction_key=prediction_key.PredictionKey.SCORES, - enable_centered_bias=enable_centered_bias) + params={ + "head": head, + "linear_feature_columns": linear_feature_columns, + "linear_optimizer": linear_optimizer or "Ftrl", + "joint_linear_weights": _joint_linear_weights, + "dnn_feature_columns": dnn_feature_columns, + "dnn_optimizer": dnn_optimizer or "Adagrad", + "dnn_hidden_units": dnn_hidden_units, + "dnn_activation_fn": dnn_activation_fn, + "dnn_dropout": dnn_dropout, + "gradient_clip_norm": gradient_clip_norm, + "num_ps_replicas": config.num_ps_replicas if config else 0, + "embedding_lr_multipliers": embedding_lr_multipliers, + }, + feature_engineering_fn=feature_engineering_fn) + + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, + monitors=None, max_steps=None): + """See trainable.Trainable.""" + # TODO(roumposg): Remove when deprecated monitors are removed. 
+ hooks = monitor_lib.replace_monitors_with_hooks(monitors, self) + self._estimator.fit(x=x, + y=y, + input_fn=input_fn, + steps=steps, + batch_size=batch_size, + monitors=hooks, + max_steps=max_steps) + return self + + def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): + """See evaluable.Evaluable.""" + # TODO(zakaria): remove once deprecation is finished (b/31229024) + custom_metrics = {} + if metrics: + for key, metric in six.iteritems(metrics): + if (not isinstance(metric, metric_spec.MetricSpec) and + not isinstance(key, tuple)): + custom_metrics[(key, prediction_key.PredictionKey.SCORES)] = metric + else: + custom_metrics[key] = metric + + return self._estimator.evaluate( + x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, + steps=steps, metrics=custom_metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, as_iterable=False) def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True): - """Runs inference to determine the predicted class.""" + """Returns predicted scores for given features. + + Args: + x: features. + input_fn: Input function. If set, x must be None. + batch_size: Override default batch size. + as_iterable: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + + Returns: + Numpy array of predicted scores (or an iterable of predicted scores if + as_iterable is True). If `label_dimension == 1`, the shape of the output + is `[batch_size]`, otherwise the shape is `[batch_size, label_dimension]`. + """ key = prediction_key.PredictionKey.SCORES - preds = super(DNNLinearCombinedRegressor, self).predict( + preds = self._estimator.predict( x=x, input_fn=input_fn, batch_size=batch_size, outputs=[key], as_iterable=as_iterable) if as_iterable: - return _as_iterable(preds, output=key) + return (pred[key] for pred in preds) return preds[key] + def _get_predict_ops(self, features): + """See `Estimator` class.""" + # This method exists to support some models that use the legacy interface. + # pylint: disable=protected-access + return self._estimator._get_predict_ops(features) + + def get_variable_names(self): + """Returns list of all variable names in this model. + + Returns: + List of names. + """ + return self._estimator.get_variable_names() + + def get_variable_value(self, name): + """Returns value of the variable given by name. + + Args: + name: string, name of the tensor. + + Returns: + `Tensor` object. 
+ """ + return self._estimator.get_variable_value(name) + def export(self, export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, - default_batch_size=None, + default_batch_size=1, exports_to_keep=None): - return super(DNNLinearCombinedRegressor, self).export( + """See BaseEstimator.export.""" + def default_input_fn(unused_estimator, examples): + return layers.parse_feature_columns_from_examples( + examples, self._feature_columns) + return self._estimator.export( export_dir=export_dir, - input_fn=input_fn, + input_fn=input_fn or default_input_fn, input_feature_key=input_feature_key, use_deprecated_input_fn=use_deprecated_input_fn, - signature_fn=signature_fn, + signature_fn=signature_fn or export.regression_signature_fn, prediction_key=prediction_key.PredictionKey.SCORES, default_batch_size=default_batch_size, exports_to_keep=exports_to_keep) + + @property + def model_dir(self): + return self._estimator.model_dir + + @property + def config(self): + return self._estimator.config diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py index 33d0d2eb4f054c..ff059a9727198a 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py @@ -207,7 +207,7 @@ def _input_fn(): features = { 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } labels = tf.constant([[1], [0], [0]]) return features, labels @@ -566,7 +566,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -636,7 +636,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -664,7 +664,7 @@ def input_fn(): return { 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 99) @@ -859,7 +859,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant(labels, dtype=tf.float32) @@ -893,7 +893,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant(labels, dtype=tf.float32) @@ -943,7 +943,7 @@ def _my_metric_op(predictions, labels): steps=1, metrics={ 'my_error': tf.contrib.metrics.streaming_mean_squared_error, - 'my_metric': _my_metric_op + ('my_metric', 'scores'): _my_metric_op }) self.assertIn('loss', set(scores.keys())) self.assertIn('my_error', set(scores.keys())) @@ -954,7 +954,7 @@ def _my_metric_op(predictions, labels): _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions), scores['my_error']) - # Tests that when the key is a tuple, an error is raised. + # Tests the case that the 2nd element of the key is not "scores". 
with self.assertRaises(KeyError): regressor.evaluate( input_fn=_input_fn, @@ -962,6 +962,65 @@ def _my_metric_op(predictions, labels): metrics={('my_error', 'predictions' ): tf.contrib.metrics.streaming_mean_squared_error}) + # Tests the case where the tuple of the key doesn't have 2 elements. + with self.assertRaises(ValueError): + regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + ('bad_length_name', 'scores', 'bad_length'): + tf.contrib.metrics.streaming_mean_squared_error + }) + + def testCustomMetricsWithMetricSpec(self): + """Tests custom evaluation metrics.""" + def _input_fn(num_epochs=None): + # Create 4 rows, one of them (y = x), three of them (y=Not(x)) + labels = tf.constant([[1.], [0.], [0.], [0.]]) + features = {'x': tf.train.limit_epochs( + tf.ones(shape=[4, 1], dtype=tf.float32), num_epochs=num_epochs)} + return features, labels + + def _my_metric_op(predictions, labels): + return tf.reduce_sum(tf.mul(predictions, labels)) + + regressor = tf.contrib.learn.DNNLinearCombinedRegressor( + linear_feature_columns=[tf.contrib.layers.real_valued_column('x')], + dnn_feature_columns=[tf.contrib.layers.real_valued_column('x')], + dnn_hidden_units=[3, 3], + config=tf.contrib.learn.RunConfig(tf_random_seed=1)) + + regressor.fit(input_fn=_input_fn, steps=5) + scores = regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + 'my_error': MetricSpec( + metric_fn=tf.contrib.metrics.streaming_mean_squared_error, + prediction_key='scores'), + 'my_metric': MetricSpec( + metric_fn=_my_metric_op, + prediction_key='scores') + }) + self.assertIn('loss', set(scores.keys())) + self.assertIn('my_error', set(scores.keys())) + self.assertIn('my_metric', set(scores.keys())) + predict_input_fn = functools.partial(_input_fn, num_epochs=1) + predictions = np.array(list(regressor.predict(input_fn=predict_input_fn))) + self.assertAlmostEqual( + _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions), + scores['my_error']) + + # Tests the case where the prediction_key is not "scores". 
+ with self.assertRaisesRegexp(KeyError, 'bad_type'): + regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + 'bad_name': MetricSpec( + metric_fn=tf.contrib.metrics.streaming_auc, + prediction_key='bad_type')}) + def testExport(self): """Tests export model for servo.""" labels = [1., 0., 0.2] @@ -971,7 +1030,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant(labels, dtype=tf.float32) @@ -1036,7 +1095,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1., 0., 0.2], dtype=tf.float32) @@ -1083,7 +1142,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1., 0., 0.2], dtype=tf.float32) @@ -1116,7 +1175,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1., 0., 0.2], dtype=tf.float32) @@ -1143,7 +1202,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1., 0., 0.2], dtype=tf.float32) @@ -1198,7 +1257,7 @@ def feature_engineering_fn(features, labels): # predictions = y prediction_with_fe_fn = next( estimator_with_fe_fn.predict(input_fn=input_fn, as_iterable=True)) - self.assertAlmostEqual(1000., prediction_with_fe_fn, delta=1.0) + self.assertAlmostEqual(1000., prediction_with_fe_fn, delta=10.0) prediction_without_fe_fn = next( estimator_without_fe_fn.predict(input_fn=input_fn, as_iterable=True)) self.assertAlmostEqual(100., prediction_without_fe_fn, delta=1.0) diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py index 9196d78d22db02..9e1bf07245b38c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py @@ -638,7 +638,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -962,7 +962,7 @@ def _my_metric_op(predictions, labels): steps=1, metrics={ 'my_error': tf.contrib.metrics.streaming_mean_squared_error, - 'my_metric': _my_metric_op + ('my_metric', 'scores'): _my_metric_op }) self.assertIn('loss', set(scores.keys())) self.assertIn('my_error', set(scores.keys())) @@ -973,7 +973,7 @@ def _my_metric_op(predictions, labels): _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions), scores['my_error']) - # Tests that when the key is a tuple, an error is raised. + # Tests the case that the 2nd element of the key is not "scores". 
with self.assertRaises(KeyError): regressor.evaluate( input_fn=_input_fn, @@ -981,6 +981,66 @@ def _my_metric_op(predictions, labels): metrics={('my_error', 'predictions'): tf.contrib.metrics.streaming_mean_squared_error}) + # Tests the case where the tuple of the key doesn't have 2 elements. + with self.assertRaises(ValueError): + regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + ('bad_length_name', 'scores', 'bad_length'): + tf.contrib.metrics.streaming_mean_squared_error + }) + + def testCustomMetricsWithMetricSpec(self): + """Tests custom evaluation metrics that use MetricSpec.""" + def _input_fn(num_epochs=None): + # Create 4 rows, one of them (y = x), three of them (y=Not(x)) + labels = tf.constant([[1.], [0.], [0.], [0.]]) + features = { + 'x': tf.train.limit_epochs( + tf.ones(shape=[4, 1], dtype=tf.float32), num_epochs=num_epochs), + } + return features, labels + + def _my_metric_op(predictions, labels): + return tf.reduce_sum(tf.mul(predictions, labels)) + + regressor = tf.contrib.learn.DNNRegressor( + feature_columns=[tf.contrib.layers.real_valued_column('x')], + hidden_units=[3, 3], + config=tf.contrib.learn.RunConfig(tf_random_seed=1)) + + regressor.fit(input_fn=_input_fn, steps=5) + scores = regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + 'my_error': MetricSpec( + metric_fn=tf.contrib.metrics.streaming_mean_squared_error, + prediction_key='scores'), + 'my_metric': MetricSpec( + metric_fn=_my_metric_op, + prediction_key='scores') + }) + self.assertIn('loss', set(scores.keys())) + self.assertIn('my_error', set(scores.keys())) + self.assertIn('my_metric', set(scores.keys())) + predict_input_fn = functools.partial(_input_fn, num_epochs=1) + predictions = np.array(list(regressor.predict(input_fn=predict_input_fn))) + self.assertAlmostEqual( + _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions), + scores['my_error']) + + # Tests the case where the prediction_key is not "scores". 
+ with self.assertRaisesRegexp(KeyError, 'bad_type'): + regressor.evaluate( + input_fn=_input_fn, + steps=1, + metrics={ + 'bad_name': MetricSpec( + metric_fn=tf.contrib.metrics.streaming_auc, + prediction_key='bad_type')}) + def testTrainSaveLoad(self): """Tests that insures you can save and reload a trained model.""" def _input_fn(num_epochs=None): diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py index f5347892708500..ab869958d0b821 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py @@ -27,7 +27,7 @@ from tensorflow.python.ops import rnn_cell -class IdentityRNNCell(tf.nn.rnn_cell.RNNCell): +class IdentityRNNCell(tf.contrib.rnn.RNNCell): def __init__(self, state_size, output_size): self._state_size = state_size diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index ade126d6d149fd..38018282591352 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -39,6 +39,7 @@ from tensorflow.contrib.framework import list_variables from tensorflow.contrib.framework import load_variable from tensorflow.contrib.framework.python.framework import experimental +from tensorflow.contrib.framework.python.ops import ops as contrib_ops from tensorflow.contrib.framework.python.ops import variables as contrib_variables from tensorflow.contrib.learn.python.learn import evaluable from tensorflow.contrib.learn.python.learn import graph_actions @@ -187,7 +188,7 @@ def _get_replica_device_setter(config): A replica device setter, or None. """ ps_ops = [ - 'Variable', 'AutoReloadVariable', 'MutableHashTable', + 'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable', 'MutableHashTableOfTensors', 'MutableDenseHashTable' ] @@ -414,7 +415,7 @@ def partial_fit( ) def evaluate( self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, - steps=None, metrics=None, name=None): + steps=None, metrics=None, name=None, checkpoint_path=None): # pylint: disable=g-doc-args,g-doc-return-or-yield """See `Evaluable`. @@ -429,11 +430,13 @@ def evaluate( if metrics is not None and not isinstance(metrics, dict): raise ValueError('Metrics argument should be None or dict. ' 'Got %s.' % metrics) - eval_results, global_step = self._evaluate_model(input_fn=input_fn, - feed_fn=feed_fn, - steps=steps, - metrics=metrics, - name=name) + eval_results, global_step = self._evaluate_model( + input_fn=input_fn, + feed_fn=feed_fn, + steps=steps, + metrics=metrics, + name=name, + checkpoint_path=checkpoint_path) if eval_results is not None: eval_results.update({'global_step': global_step}) return eval_results @@ -769,18 +772,21 @@ def _evaluate_model(self, steps, feed_fn=None, metrics=None, - name=''): + name='', + checkpoint_path=None): # TODO(wicke): Remove this once Model and associated code are gone. if (hasattr(self._config, 'execution_mode') and self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset')): return None, None - # Check that model has been trained. - checkpoint_path = self._model_dir - latest_path = saver.latest_checkpoint(checkpoint_path) - if not latest_path: - raise NotFittedError("Couldn't find trained model at %s." 
- % checkpoint_path) + # Check that model has been trained (if nothing has been set explicitly). + if not checkpoint_path: + latest_path = saver.latest_checkpoint(self._model_dir) + if not latest_path: + raise NotFittedError("Couldn't find trained model at %s." + % self._model_dir) + checkpoint_path = self._model_dir + # Setup output directory. eval_dir = os.path.join(self._model_dir, 'eval' if not name else 'eval_' + name) @@ -914,7 +920,15 @@ def _feed_fn(): def _infer_model_as_iterable( self, checkpoint_path, predictions, feed_fn, return_dict): if feed_fn is None: - feed_dicts = itertools.repeat(None) + # If there are no queue_runners, the input `predictions` is a + # constant, and we should stop after the first epoch. If, + # instead, there are queue_runners, eventually they should throw + # an `OutOfRangeError`. + graph = contrib_ops.get_graph_from_inputs(predictions.values()) + if graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS): + feed_dicts = itertools.repeat(None) + else: + feed_dicts = [None] else: def _feed_fn(): while True: diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index a43b960a96b51a..011325f85fb63f 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -54,6 +54,17 @@ def boston_input_fn(num_epochs=None): return features, labels +def boston_input_fn_with_queue(num_epochs=None): + features, labels = boston_input_fn(num_epochs=num_epochs) + + # Create a minimal queue runner. + fake_queue = tf.FIFOQueue(30, tf.int32) + queue_runner = tf.train.QueueRunner(fake_queue, [tf.constant(0)]) + tf.train.add_queue_runner(queue_runner) + + return features, labels + + def iris_input_fn(): iris = tf.contrib.learn.datasets.load_iris() features = tf.reshape(tf.constant(iris.data), [-1, _IRIS_INPUT_DIM]) @@ -577,6 +588,25 @@ def testPredictInputFn(self): output = list(est.predict(input_fn=input_fn)) self.assertEqual(len(output), boston.target.shape[0]) + def testPredictInputFnWithQueue(self): + est = tf.contrib.learn.Estimator(model_fn=linear_model_fn) + boston = tf.contrib.learn.datasets.load_boston() + est.fit(input_fn=boston_input_fn, steps=1) + input_fn = functools.partial(boston_input_fn_with_queue, num_epochs=2) + output = list(est.predict(input_fn=input_fn)) + self.assertEqual(len(output), boston.target.shape[0]*2) + + def testPredictConstInputFn(self): + est = tf.contrib.learn.Estimator(model_fn=linear_model_fn) + boston = tf.contrib.learn.datasets.load_boston() + est.fit(input_fn=boston_input_fn, steps=1) + def input_fn(): + features = tf.reshape(tf.constant(boston.data), [-1, _BOSTON_INPUT_DIM]) + labels = tf.reshape(tf.constant(boston.target), [-1, 1]) + return features, labels + output = list(est.predict(input_fn=input_fn)) + self.assertEqual(len(output), boston.target.shape[0]) + def testWithModelFnOps(self): """Test for model_fn that returns `ModelFnOps`.""" est = tf.contrib.learn.Estimator(model_fn=linear_model_fn_with_model_fn_ops) diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index d981ca5e8ac0ea..194ef3c5d19d54 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -19,6 +19,7 @@ from __future__ import print_function import abc +import functools import six from tensorflow.contrib import losses @@ -31,6 +32,7 
@@ from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.session_bundle import exporter from tensorflow.python import summary +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops @@ -78,7 +80,7 @@ def _regression_head(label_name=None, def _multi_class_head(n_classes, label_name=None, weight_column_name=None, enable_centered_bias=False, head_name=None, - thresholds=None): + thresholds=None, metric_class_ids=None): """Creates a _Head for multi class single label classification. The Head uses softmax cross entropy loss. @@ -96,18 +98,24 @@ def _multi_class_head(n_classes, label_name=None, weight_column_name=None, head_name: name of the head. If provided, predictions, summary and metrics keys will be prefixed by the head_name and an underscore. thresholds: thresholds for eval metrics, defaults to [.5] + metric_class_ids: List of class IDs for which we should report per-class + metrics. Must all be in the range `[0, n_classes)`. Invalid if + `n_classes` is 2. Returns: An instance of _MultiClassHead. Raises: - ValueError: if n_classes is < 2 + ValueError: if `n_classes` is < 2, or `metric_class_ids` is provided when + `n_classes` is 2. """ if (n_classes is None) or (n_classes < 2): raise ValueError( "n_classes must be > 1 for classification: %s." % n_classes) if n_classes == 2: + if metric_class_ids: + raise ValueError("metric_class_ids invalid for n_classes==2.") return _BinaryLogisticHead(label_name=label_name, weight_column_name=weight_column_name, enable_centered_bias=enable_centered_bias, @@ -119,7 +127,8 @@ def _multi_class_head(n_classes, label_name=None, weight_column_name=None, weight_column_name=weight_column_name, enable_centered_bias=enable_centered_bias, head_name=head_name, - thresholds=thresholds) + thresholds=thresholds, + metric_class_ids=metric_class_ids) def _binary_svm_head(label_name=None, weight_column_name=None, @@ -155,7 +164,7 @@ def _binary_svm_head(label_name=None, weight_column_name=None, def _multi_label_head(n_classes, label_name=None, weight_column_name=None, enable_centered_bias=False, head_name=None, - thresholds=None): + thresholds=None, metric_class_ids=None): """Creates a _Head for multi label classification. The Head uses softmax cross entropy loss. @@ -173,6 +182,8 @@ def _multi_label_head(n_classes, label_name=None, weight_column_name=None, head_name: name of the head. If provided, predictions, summary and metrics keys will be prefixed by the head_name and an underscore. thresholds: thresholds for eval metrics, defaults to [.5] + metric_class_ids: List of class IDs for which we should report per-class + metrics. Must all be in the range `[0, n_classes)`. Returns: An instance of _MultiClassHead. 
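To make the new `metric_class_ids` argument concrete, a minimal sketch; the import alias mirrors `head_lib` as used by the tests later in this diff, and the private helpers are called directly only for illustration.

from tensorflow.contrib.learn.python.learn.estimators import head as head_lib

# Report per-class prediction/label/probability/logits means and AUC for
# classes 0 and 2 of a 3-class problem.
multi_class = head_lib._multi_class_head(  # pylint: disable=protected-access
    n_classes=3, metric_class_ids=[0, 2])

# The multi-label variant accepts the same argument.
multi_label = head_lib._multi_label_head(  # pylint: disable=protected-access
    n_classes=3, metric_class_ids=[0, 2])

# With n_classes=2 the helper returns a binary logistic head and, per the
# hunk above, rejects metric_class_ids with a ValueError.
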
@@ -187,7 +198,8 @@ def _multi_label_head(n_classes, label_name=None, weight_column_name=None, weight_column_name=weight_column_name, enable_centered_bias=enable_centered_bias, head_name=head_name, - thresholds=thresholds) + thresholds=thresholds, + metric_class_ids=metric_class_ids) # TODO(zakaria): Make the classes public once we are ready for users to subclass @@ -314,16 +326,16 @@ def head_ops(self, features, labels, mode, train_op_fn, logits=None, train_op = None eval_metric_ops = None if (mode != model_fn.ModeKeys.INFER) and (labels is not None): - labels = _check_labels(labels, self._label_name) + labels_tensor = _to_labels_tensor(labels, self._label_name) loss = _training_loss( - features, labels, logits, + features, labels_tensor, logits, loss_fn=self._loss_fn, weight_column_name=self._weight_column_name, head_name=self._head_name) if (mode == model_fn.ModeKeys.TRAIN) and (train_op_fn is not None): train_op = _train_op( - loss, labels, train_op_fn, centered_bias, self.logits_dimension, - self._loss_fn) + loss, labels_tensor, train_op_fn, centered_bias, + self.logits_dimension, self._loss_fn) eval_metric_ops = _eval_metric_ops( self._default_metrics(), features, labels, predictions) @@ -353,7 +365,9 @@ def _logits_to_predictions(self, logits): def _signature_fn(self): """Returns the signature_fn to be used in exporting.""" - def _regression_signature_fn(examples, unused_features, predictions): + def _regression_signature_fn(examples, features, predictions): + # pylint: disable=missing-docstring + del features if isinstance(predictions, dict): score = predictions[prediction_key.PredictionKey.SCORES] else: @@ -440,16 +454,16 @@ def head_ops(self, features, labels, mode, train_op_fn, logits=None, train_op = None eval_metric_ops = None if (mode != model_fn.ModeKeys.INFER) and (labels is not None): - labels = _check_labels(labels, self._label_name) + labels_tensor = _to_labels_tensor(labels, self._label_name) loss = _training_loss( - features, labels, logits, + features, labels_tensor, logits, loss_fn=self._loss_fn, weight_column_name=self._weight_column_name, head_name=self._head_name) if (mode == model_fn.ModeKeys.TRAIN) and (train_op_fn is not None): train_op = _train_op( - loss, labels, train_op_fn, centered_bias, self.logits_dimension, - self._loss_fn) + loss, labels_tensor, train_op_fn, centered_bias, + self.logits_dimension, self._loss_fn) eval_metric_ops = _eval_metric_ops( self._default_metrics(), features, labels, predictions) @@ -485,8 +499,9 @@ def _logits_to_predictions(self, logits): def _signature_fn(self): """Returns the signature_fn to be used in exporting.""" - def _classification_signature_fn(examples, unused_features, predictions): + def _classification_signature_fn(examples, features, predictions): """Servo signature function.""" + del features if isinstance(predictions, dict): default_signature = exporter.classification_signature( input_tensor=examples, @@ -527,12 +542,13 @@ def _add_binary_metric(key, metric_fn): _add_binary_metric( metric_key.MetricKey.PREDICTION_MEAN, _predictions_streaming_mean) _add_binary_metric( - metric_key.MetricKey.LABEL_MEAN, _labels_streaming_mean) + metric_key.MetricKey.LABEL_MEAN, _indicator_labels_streaming_mean) # Also include the streaming mean of the label as an accuracy baseline, as # a reminder to users. 
_add_binary_metric( - metric_key.MetricKey.ACCURACY_BASELINE, _labels_streaming_mean) + metric_key.MetricKey.ACCURACY_BASELINE, + _indicator_labels_streaming_mean) _add_binary_metric(metric_key.MetricKey.AUC, _streaming_auc) @@ -571,7 +587,8 @@ class _MultiClassHead(_Head): def __init__(self, n_classes, label_name, weight_column_name, enable_centered_bias, head_name, - loss_fn=_softmax_cross_entropy_loss, thresholds=None): + loss_fn=_softmax_cross_entropy_loss, thresholds=None, + metric_class_ids=None): """Base type for all single heads. Args: @@ -589,9 +606,11 @@ def __init__(self, n_classes, label_name, keys will be prefixed by the head_name and an underscore. loss_fn: Loss function. thresholds: thresholds for eval. + metric_class_ids: List of class IDs for which we should report per-class + metrics. Must all be in the range `[0, n_classes)`. Raises: - ValueError: if n_classes is invalid. + ValueError: if `n_classes` or `metric_class_ids` is invalid. """ super(_MultiClassHead, self).__init__(head_name=head_name) @@ -604,6 +623,11 @@ def __init__(self, n_classes, label_name, self._loss_fn = loss_fn self._enable_centered_bias = enable_centered_bias self._problem_type = constants.ProblemType.CLASSIFICATION + self._metric_class_ids = tuple( + [] if metric_class_ids is None else metric_class_ids) + for class_id in self._metric_class_ids: + if (class_id < 0) or (class_id >= n_classes): + raise ValueError("Class ID %s not in [0, %s)." % (class_id, n_classes)) @property def logits_dimension(self): @@ -625,16 +649,16 @@ def head_ops(self, features, labels, mode, train_op_fn, logits=None, train_op = None eval_metric_ops = None if (mode != model_fn.ModeKeys.INFER) and (labels is not None): - labels = _check_labels(labels, self._label_name) + labels_tensor = _to_labels_tensor(labels, self._label_name) loss = _training_loss( - features, labels, logits, + features, labels_tensor, logits, loss_fn=self._loss_fn, weight_column_name=self._weight_column_name, head_name=self._head_name) if (mode == model_fn.ModeKeys.TRAIN) and (train_op_fn is not None): train_op = _train_op( - loss, labels, train_op_fn, centered_bias, self._logits_dimension, - self._loss_fn) + loss, labels_tensor, train_op_fn, centered_bias, + self._logits_dimension, self._loss_fn) eval_metric_ops = _eval_metric_ops( self._default_metrics(), features, labels, predictions) @@ -667,8 +691,9 @@ def _logits_to_predictions(self, logits): def _signature_fn(self): """Returns the signature_fn to be used in exporting.""" - def _classification_signature_fn(examples, unused_features, predictions): + def _classification_signature_fn(examples, features, predictions): """Servo signature function.""" + del features if isinstance(predictions, dict): default_signature = exporter.classification_signature( input_tensor=examples, @@ -684,35 +709,121 @@ def _classification_signature_fn(examples, unused_features, predictions): return default_signature, {} return _classification_signature_fn + def _metric_spec(self, metric_fn, prediction_name): + return metric_spec.MetricSpec( + metric_fn, prediction_name, self._label_name, self._weight_column_name) + def _default_metrics(self): """Returns a dict of `MetricSpec` objects keyed by name.""" - metrics = {_head_prefixed(self._head_name, metric_key.MetricKey.LOSS): - _weighted_average_loss_metric_spec( - self._loss_fn, - prediction_key.PredictionKey.LOGITS, - self._label_name, - self._weight_column_name)} - - # TODO(b/29366811): This currently results in both an "accuracy" and an - # "accuracy/threshold_0.500000_mean" 
metric for binary classification. - metrics[_head_prefixed(self._head_name, metric_key.MetricKey.ACCURACY)] = ( - metric_spec.MetricSpec(metrics_lib.streaming_accuracy, - prediction_key.PredictionKey.CLASSES, - self._label_name, - self._weight_column_name)) - - # TODO(b/32953199): Add multiclass metrics. + def _streaming_auc_with_class_id_label(predictions, labels, weights=None): + indicator_labels = _class_id_labels_to_indicator( + labels, num_classes=self.logits_dimension) + return _streaming_auc(predictions, indicator_labels, weights) + + loss_key = _head_prefixed(self._head_name, metric_key.MetricKey.LOSS) + accuracy_key = _head_prefixed( + self._head_name, metric_key.MetricKey.ACCURACY) + auc_key = _head_prefixed(self._head_name, metric_key.MetricKey.AUC) + + metrics = { + loss_key: _weighted_average_loss_metric_spec( + self._loss_fn, + prediction_key.PredictionKey.LOGITS, + self._label_name, + self._weight_column_name), + # TODO(b/29366811): This currently results in both an "accuracy" and an + # "accuracy/threshold_0.500000_mean" metric for binary classification. + accuracy_key: self._metric_spec( + metrics_lib.streaming_accuracy, + prediction_key.PredictionKey.CLASSES), + auc_key: self._metric_spec( + _streaming_auc_with_class_id_label, + prediction_key.PredictionKey.PROBABILITIES) + } + + def _class_predictions_streaming_mean( + predictions, labels, weights=None, class_id=None): + del labels + return metrics_lib.streaming_mean( + array_ops.where( + math_ops.equal( + math_ops.to_int32(class_id), + math_ops.to_int32(predictions)), + array_ops.ones_like(predictions), + array_ops.zeros_like(predictions)), + weights=weights) + + def _class_labels_streaming_mean( + predictions, labels, weights=None, class_id=None): + del predictions + assert class_id is not None + return metrics_lib.streaming_mean( + array_ops.where( + math_ops.equal( + math_ops.to_int32(class_id), + math_ops.to_int32(labels)), + array_ops.ones_like(labels), + array_ops.zeros_like(labels)), + weights=weights) + + def _class_streaming_auc(predictions, labels, weights=None, class_id=None): + assert class_id is not None + indicator_labels = _class_id_labels_to_indicator( + labels, num_classes=self.logits_dimension) + return _streaming_auc( + predictions, indicator_labels, weights=weights, class_id=class_id) + + for class_id in self._metric_class_ids: + + # TODO(ptucker): Add per-class accuracy, precision, recall. 
+ + prediction_mean_key = _head_prefixed( + self._head_name, + metric_key.MetricKey.CLASS_PREDICTION_MEAN % class_id) + label_mean_key = _head_prefixed( + self._head_name, metric_key.MetricKey.CLASS_LABEL_MEAN % class_id) + probability_mean_key = _head_prefixed( + self._head_name, + metric_key.MetricKey.CLASS_PROBABILITY_MEAN % class_id) + logits_mean_key = _head_prefixed( + self._head_name, + metric_key.MetricKey.CLASS_LOGITS_MEAN % class_id) + auc_key = _head_prefixed( + self._head_name, metric_key.MetricKey.CLASS_AUC % class_id) + + metrics[prediction_mean_key] = self._metric_spec( + functools.partial( + _class_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.CLASSES) + metrics[label_mean_key] = self._metric_spec( + functools.partial(_class_labels_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.PROBABILITIES) + metrics[probability_mean_key] = self._metric_spec( + functools.partial(_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.PROBABILITIES) + metrics[logits_mean_key] = self._metric_spec( + functools.partial(_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.LOGITS) + metrics[auc_key] = self._metric_spec( + functools.partial(_class_streaming_auc, class_id=class_id), + prediction_key.PredictionKey.LOGITS) return metrics -def _check_labels(labels, label_name): +def _to_labels_tensor(labels, label_name): labels = labels[label_name] if isinstance(labels, dict) else labels if isinstance(labels, sparse_tensor.SparseTensor): raise ValueError("SparseTensor is not supported as labels.") return labels +def _assert_labels_rank(labels): + return control_flow_ops.Assert( + math_ops.less_equal(array_ops.rank(labels), 2), + ("labels shape should be either [batch_size, 1] or [batch_size]",)) + + class _BinarySvmHead(_BinaryLogisticHead): """_Head for binary classification using SVMs.""" @@ -720,12 +831,8 @@ def __init__(self, label_name, weight_column_name, enable_centered_bias, head_name, thresholds): def _loss_fn(logits, labels): with ops.name_scope(None, "hinge_loss", (logits, labels)) as name: - check_shape_op = control_flow_ops.Assert( - math_ops.less_equal(array_ops.rank(labels), 2), - ("labels shape should be either [batch_size, 1] or [batch_size]",)) - with ops.control_dependencies((check_shape_op,)): - labels = array_ops.reshape( - labels, shape=(array_ops.shape(labels)[0], 1)) + with ops.control_dependencies((_assert_labels_rank(labels),)): + labels = array_ops.reshape(labels, shape=(-1, 1)) return losses.hinge_loss(logits, labels, scope=name) super(_BinarySvmHead, self).__init__( @@ -769,7 +876,7 @@ class _MultiLabelHead(_MultiClassHead): # TODO(zakaria): add signature and metric for multilabel. 
def __init__(self, n_classes, label_name, weight_column_name, enable_centered_bias, head_name, - thresholds): + thresholds, metric_class_ids=None): super(_MultiLabelHead, self).__init__( n_classes=n_classes, @@ -778,7 +885,8 @@ def __init__(self, n_classes, label_name, enable_centered_bias=enable_centered_bias, head_name=head_name, loss_fn=_sigmoid_cross_entropy_loss, - thresholds=thresholds) + thresholds=thresholds, + metric_class_ids=metric_class_ids) def _logits_to_predictions(self, logits): """See `_MultiClassHead`.""" @@ -792,19 +900,79 @@ def _logits_to_predictions(self, logits): name=prediction_key.PredictionKey.CLASSES) } + def _metric_spec(self, metric_fn, prediction_name): + return metric_spec.MetricSpec( + metric_fn, prediction_name, self._label_name, self._weight_column_name) + + def _default_metrics(self): + """Returns a dict of `MetricSpec` objects keyed by name.""" + loss_key = _head_prefixed(self._head_name, metric_key.MetricKey.LOSS) + accuracy_key = _head_prefixed( + self._head_name, metric_key.MetricKey.ACCURACY) + auc_key = _head_prefixed(self._head_name, metric_key.MetricKey.AUC) + + metrics = { + loss_key: _weighted_average_loss_metric_spec( + self._loss_fn, + prediction_key.PredictionKey.LOGITS, + self._label_name, + self._weight_column_name), + # TODO(b/29366811): This currently results in both an "accuracy" and an + # "accuracy/threshold_0.500000_mean" metric for binary classification. + accuracy_key: self._metric_spec( + metrics_lib.streaming_accuracy, + prediction_key.PredictionKey.CLASSES), + auc_key: self._metric_spec( + _streaming_auc, prediction_key.PredictionKey.PROBABILITIES), + } + + for class_id in self._metric_class_ids: + + # TODO(ptucker): Add per-class accuracy, precision, recall. + + prediction_mean_key = _head_prefixed( + self._head_name, + metric_key.MetricKey.CLASS_PREDICTION_MEAN % class_id) + label_mean_key = _head_prefixed( + self._head_name, metric_key.MetricKey.CLASS_LABEL_MEAN % class_id) + probability_mean_key = _head_prefixed( + self._head_name, + metric_key.MetricKey.CLASS_PROBABILITY_MEAN % class_id) + logits_mean_key = _head_prefixed( + self._head_name, metric_key.MetricKey.CLASS_LOGITS_MEAN % class_id) + auc_key = _head_prefixed( + self._head_name, metric_key.MetricKey.CLASS_AUC % class_id) + + metrics[prediction_mean_key] = self._metric_spec( + functools.partial(_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.CLASSES) + metrics[label_mean_key] = self._metric_spec( + functools.partial( + _indicator_labels_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.CLASSES) + metrics[probability_mean_key] = self._metric_spec( + functools.partial(_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.PROBABILITIES) + metrics[logits_mean_key] = self._metric_spec( + functools.partial(_predictions_streaming_mean, class_id=class_id), + prediction_key.PredictionKey.LOGITS) + metrics[auc_key] = self._metric_spec( + functools.partial(_streaming_auc, class_id=class_id), + prediction_key.PredictionKey.LOGITS) + + return metrics + def _weighted_loss(loss, weight): - """Returns cumulative weighted loss.""" + """Returns cumulative weighted loss as 1d `Tensor`.""" with ops.name_scope(None, "weighted_loss", (loss, weight)) as name: - unweighted_loss = array_ops.reshape(loss, shape=(-1,)) - weighted_loss = math_ops.mul(unweighted_loss, - array_ops.reshape( - weight, shape=(-1,)), - name=name) - return weighted_loss + return math_ops.mul(array_ops.reshape(loss, shape=(-1,)), + 
array_ops.reshape(weight, shape=(-1,)), + name=name) def _weight_tensor(features, weight_column_name): + """Returns weights as 1d `Tensor`.""" if not weight_column_name: return None with ops.name_scope( @@ -872,17 +1040,18 @@ def _centered_bias_step(centered_bias, logits_dimension, labels, loss_fn): """Creates and returns training op for centered bias.""" if (logits_dimension is None) or (logits_dimension < 1): raise ValueError("Invalid logits_dimension %s." % logits_dimension) - batch_size = array_ops.shape(labels)[0] - logits = array_ops.reshape( - array_ops.tile(centered_bias, (batch_size,)), - (batch_size, logits_dimension)) - with ops.name_scope(None, "centered_bias", (labels, logits)): - centered_bias_loss = math_ops.reduce_mean( - loss_fn(logits, labels), name="training_loss") + with ops.name_scope(None, "centered_bias_step", (labels,)) as name: + batch_size = array_ops.shape(labels)[0] + logits = array_ops.reshape( + array_ops.tile(centered_bias, (batch_size,)), + (batch_size, logits_dimension)) + with ops.name_scope(None, "centered_bias", (labels, logits)): + centered_bias_loss = math_ops.reduce_mean( + loss_fn(logits, labels), name="training_loss") # Learn central bias by an optimizer. 0.1 is a convervative lr for a # single variable. return training.AdagradOptimizer(0.1).minimize( - centered_bias_loss, var_list=(centered_bias,), name="centered_bias_step") + centered_bias_loss, var_list=(centered_bias,), name=name) def _head_prefixed(head_name, val): @@ -981,17 +1150,49 @@ def _streaming_weighted_average_loss(predictions, labels, weights=None): pred_key, label_key, weight_key) -def _labels_streaming_mean(unused_predictions, labels, weights=None): +def _indicator_labels_streaming_mean( + predictions, labels, weights=None, class_id=None): + del predictions + if class_id is not None: + labels = labels[:, class_id] return metrics_lib.streaming_mean(labels, weights=weights) -def _predictions_streaming_mean(predictions, unused_labels, weights=None): +def _predictions_streaming_mean( + predictions, labels, weights=None, class_id=None): + del labels + if class_id is not None: + predictions = predictions[:, class_id] return metrics_lib.streaming_mean(predictions, weights=weights) -def _streaming_auc(predictions, labels, weights=None): - return metrics_lib.streaming_auc(predictions, labels, - weights=_float_weights_or_none(weights)) +# TODO(ptucker): Add support for SparseTensor labels. +def _class_id_labels_to_indicator(labels, num_classes): + if (num_classes is None) or (num_classes < 2): + raise ValueError("Invalid num_classes %s." % num_classes) + with ops.control_dependencies((_assert_labels_rank(labels),)): + labels = array_ops.reshape(labels, (-1,)) + return array_ops.one_hot(labels, depth=num_classes, axis=-1) + + +def _streaming_auc(predictions, labels, weights=None, class_id=None): + if class_id is not None: + predictions = predictions[:, class_id] + labels = labels[:, class_id] + return metrics_lib.streaming_auc( + predictions, math_ops.cast(labels, dtypes.bool), + weights=_float_weights_or_none(weights)) + + +def _assert_class_id(class_id, num_classes=None): + """Average label value for class `class_id`.""" + if (class_id is None) or (class_id < 0): + raise ValueError("Invalid class_id %s." % class_id) + if num_classes is not None: + if num_classes < 2: + raise ValueError("Invalid num_classes %s." % num_classes) + if class_id >= num_classes: + raise ValueError("Invalid class_id %s." 
% class_id) def _accuracy_at_threshold(threshold): @@ -1012,6 +1213,6 @@ def _streaming_metrics(predictions, labels, weights=None): precision_tensor, update_op = streaming_metrics_fn( predictions, labels=labels, thresholds=(threshold,), weights=_float_weights_or_none(weights)) - return array_ops.squeeze(precision_tensor), update_op + return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op) return _streaming_metrics diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py index 79f75e9ba7a11f..b84a8ce3c2081a 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function +import math import numpy as np import six import tensorflow as tf @@ -40,59 +41,94 @@ def _assert_variables( def _assert_no_variables(test_case): - _assert_variables(test_case, set([]), set([]), set([])) + _assert_variables(test_case) -class RegressionModelHeadTest(tf.test.TestCase): +# This must be called from within a tf.Session. +def _assert_metrics( + test_case, expected_loss, expected_eval_metrics, model_fn_ops): + test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4) + for k in six.iterkeys(expected_eval_metrics): + test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops)) + tf.initialize_local_variables().run() + for key, expected_value in six.iteritems(expected_eval_metrics): + value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key] + update = update_tensor.eval() + test_case.assertAlmostEqual( + expected_value, update, places=4, + msg="%s: update, expected %s, got %s." % (key, expected_value, update)) + value = value_tensor.eval() + test_case.assertAlmostEqual( + expected_value, value, places=4, + msg="%s: value, expected %s, got %s." % (key, expected_value, value)) + + +def _sigmoid(x): + return 1. / (1. + math.exp(-1 * x)) + - def _assert_metrics(self, model_fn_ops): - self.assertItemsEqual(( - "loss", - ), six.iterkeys(model_fn_ops.eval_metric_ops)) +class RegressionModelHeadTest(tf.test.TestCase): - # TODO(zakaria): test multilabel regresssion. + # TODO(zakaria): test multilabel regression. def testRegression(self): head = head_lib._regression_head() - with tf.Graph().as_default(), tf.Session() as sess: + with tf.Graph().as_default(), tf.Session(): prediction = tf.constant([[1.], [1.], [3.]]) labels = tf.constant([[0.], [1.], [1.]]) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=prediction) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(5. / 3, sess.run(model_fn_ops.loss)) + _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops) + def testRegressionEvalMode(self): + head = head_lib._regression_head() + with tf.Graph().as_default(), tf.Session(): + prediction = tf.constant([[1.], [1.], [3.]]) + labels = tf.constant([[0.], [1.], [1.]]) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.EVAL, _noop_train_op, logits=prediction) self.assertIsNone(model_fn_ops.train_op) + _assert_no_variables(self) + _assert_metrics(self, 5. / 3, {"loss": 5. 
/ 3}, model_fn_ops) + + def testRegressionWithLabelName(self): + label_name = "my_label" + head = head_lib._regression_head(label_name=label_name) + with tf.Graph().as_default(), tf.Session(): + prediction = tf.constant([[1.], [1.], [3.]]) + labels = {label_name: tf.constant([[0.], [1.], [1.]])} + model_fn_ops = head.head_ops({}, labels, + tf.contrib.learn.ModeKeys.TRAIN, + _noop_train_op, logits=prediction) + _assert_no_variables(self) + _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops) def testRegressionWithWeights(self): head = head_lib._regression_head( weight_column_name="label_weight") - with tf.Graph().as_default(), tf.Session() as sess: - features = {"label_weight": tf.constant([[2.], [5.], [0.]])} + with tf.Graph().as_default(), tf.Session(): + weights = ((2.,), (5.,), (0.,)) + features = {"label_weight": tf.constant(weights)} prediction = tf.constant([[1.], [1.], [3.]]) labels = tf.constant([[0.], [1.], [1.]]) model_fn_ops = head.head_ops(features, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=prediction) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(2. / 3, sess.run(model_fn_ops.loss), places=3) + _assert_metrics(self, 2. / len(weights), { + "loss": 2. / np.sum(weights) + }, model_fn_ops) def testRegressionWithCenteredBias(self): - head = head_lib._regression_head( - weight_column_name="label_weight", enable_centered_bias=True) - with tf.Graph().as_default(), tf.Session() as sess: - features = {"label_weight": tf.constant([[2.], [5.], [0.]])} + head = head_lib._regression_head(enable_centered_bias=True) + with tf.Graph().as_default(), tf.Session(): prediction = tf.constant([[1.], [1.], [3.]]) labels = tf.constant([[0.], [1.], [1.]]) - model_fn_ops = head.head_ops(features, labels, + model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=prediction) - self._assert_metrics(model_fn_ops) _assert_variables(self, expected_global=( "centered_bias_weight:0", "centered_bias_weight/Adagrad:0", @@ -100,7 +136,7 @@ def testRegressionWithCenteredBias(self): "centered_bias_weight:0", )) tf.global_variables_initializer().run() - self.assertAlmostEqual(2. / 3, sess.run(model_fn_ops.loss), places=3) + _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops) def testErrorInSparseTensorLabels(self): head = head_lib._regression_head() @@ -118,52 +154,111 @@ def testErrorInSparseTensorLabels(self): class MultiLabelModelHeadTest(tf.test.TestCase): - def _assert_metrics(self, model_fn_ops): - self.assertItemsEqual(( - "accuracy", - "loss", - ), six.iterkeys(model_fn_ops.eval_metric_ops)) + def setUp(self): + self._logits = ((1., 0., 0.),) + self._labels = ((0, 0, 1),) + + def _expected_eval_metrics(self, expected_loss): + return { + "accuracy": 1. / 3, + "auc": 1. 
/ 4, + "loss": expected_loss, + "auc/class0": 1., + "auc/class1": 1., + "auc/class2": 0., + "labels/actual_label_mean/class0": self._labels[0][0], + "labels/actual_label_mean/class1": self._labels[0][1], + "labels/actual_label_mean/class2": self._labels[0][2], + "labels/logits_mean/class0": self._logits[0][0], + "labels/logits_mean/class1": self._logits[0][1], + "labels/logits_mean/class2": self._logits[0][2], + "labels/prediction_mean/class0": self._logits[0][0], + "labels/prediction_mean/class1": self._logits[0][1], + "labels/prediction_mean/class2": self._logits[0][2], + "labels/probability_mean/class0": _sigmoid(self._logits[0][0]), + "labels/probability_mean/class1": _sigmoid(self._logits[0][1]), + "labels/probability_mean/class2": _sigmoid(self._logits[0][2]), + } def testMultiLabel(self): - head = head_lib._multi_label_head(n_classes=3) - with tf.Graph().as_default(), tf.Session() as sess: - logits = tf.constant([[1., 0., 0.]]) - labels = tf.constant([[0, 0, 1]]) + n_classes = 3 + head = head_lib._multi_label_head( + n_classes=n_classes, metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(0.89985204, sess.run(model_fn_ops.loss)) + expected_loss = .89985204 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + def testMultiLabelEvalMode(self): + n_classes = 3 + head = head_lib._multi_label_head( + n_classes=n_classes, metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant([[1., 0., 0.]]) + labels = tf.constant([[0, 0, 1]]) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.EVAL, _noop_train_op, logits=logits) self.assertIsNone(model_fn_ops.train_op) + _assert_no_variables(self) + expected_loss = .89985204 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + + def testMultiLabelWithLabelName(self): + n_classes = 3 + label_name = "my_label" + head = head_lib._multi_label_head( + n_classes=n_classes, label_name=label_name, + metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant([[1., 0., 0.]]) + labels = {label_name: tf.constant([[0, 0, 1]])} + model_fn_ops = head.head_ops({}, labels, + tf.contrib.learn.ModeKeys.TRAIN, + _noop_train_op, logits=logits) + _assert_no_variables(self) + expected_loss = .89985204 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) def testMultiLabelWithWeight(self): + n_classes = 3 head = head_lib._multi_label_head( - n_classes=3, weight_column_name="label_weight") - with tf.Graph().as_default(), tf.Session() as sess: - features = {"label_weight": tf.constant([0.1])} + n_classes=n_classes, weight_column_name="label_weight", + metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + features = {"label_weight": tf.constant([.1])} logits = tf.constant([[1., 0., 0.]]) labels = tf.constant([[0, 0, 1]]) model_fn_ops = head.head_ops(features, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(0.089985214, sess.run(model_fn_ops.loss)) + _assert_metrics( + self, .089985214, 
self._expected_eval_metrics(2.69956), + model_fn_ops) def testMultiLabelWithCenteredBias(self): - head = head_lib._multi_label_head(n_classes=3, enable_centered_bias=True) - with tf.Graph().as_default(), tf.Session() as sess: + n_classes = 3 + head = head_lib._multi_label_head( + n_classes=n_classes, enable_centered_bias=True, + metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): logits = tf.constant([[1., 0., 0.]]) labels = tf.constant([[0, 0, 1]]) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_metrics(model_fn_ops) _assert_variables(self, expected_global=( "centered_bias_weight:0", "centered_bias_weight/Adagrad:0", @@ -171,45 +266,69 @@ def testMultiLabelWithCenteredBias(self): "centered_bias_weight:0", )) tf.global_variables_initializer().run() - self.assertAlmostEqual(0.89985204, sess.run(model_fn_ops.loss)) + expected_loss = .89985204 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) -class MultiClassModelHeadTest(tf.test.TestCase): +class BinaryClassificationModelHeadTest(tf.test.TestCase): - def _assert_binary_metrics(self, model_fn_ops): - self.assertItemsEqual(( - "accuracy", - "accuracy/baseline_label_mean", - "accuracy/threshold_0.500000_mean", - "auc", - "labels/actual_label_mean", - "labels/prediction_mean", - "loss", - "precision/positive_threshold_0.500000_mean", - "recall/positive_threshold_0.500000_mean", - ), six.iterkeys(model_fn_ops.eval_metric_ops)) + def setUp(self): + self._logits = ((1.,), (1.,)) + self._labels = ((1.,), (0.,)) + + def _expected_eval_metrics(self, expected_loss): + return { + "accuracy": 1. / 2, + "accuracy/baseline_label_mean": np.mean(self._labels), + "accuracy/threshold_0.500000_mean": 1. / 2, + "auc": 1. / 2, + "labels/actual_label_mean": np.mean(self._labels), + "labels/prediction_mean": .731059, # softmax + "loss": expected_loss, + "precision/positive_threshold_0.500000_mean": 1. / 2, + "recall/positive_threshold_0.500000_mean": 1. 
/ 1, + } def testBinaryClassification(self): - head = head_lib._multi_class_head(n_classes=2) - with tf.Graph().as_default(), tf.Session() as sess: - logits = tf.constant([[1.], [1.]]) - labels = tf.constant([[1.], [0.]]) + n_classes = 2 + head = head_lib._multi_class_head(n_classes=n_classes) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) # logloss: z:label, x:logit # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_binary_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(0.81326175, sess.run(model_fn_ops.loss), - delta=1e-6) + expected_loss = .81326175 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + + def testBinaryClassificationEvalMode(self): + n_classes = 2 + head = head_lib._multi_class_head(n_classes=n_classes) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) + # logloss: z:label, x:logit + # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.EVAL, _noop_train_op, logits=logits) self.assertIsNone(model_fn_ops.train_op) + _assert_no_variables(self) + expected_loss = .81326175 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) def testErrorInSparseTensorLabels(self): - head = head_lib._multi_class_head(n_classes=2) + n_classes = 2 + head = head_lib._multi_class_head(n_classes=n_classes) with tf.Graph().as_default(): prediction = tf.constant([[1.], [1.], [3.]]) labels = tf.SparseTensor( @@ -221,34 +340,63 @@ def testErrorInSparseTensorLabels(self): head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=prediction) + def testBinaryClassificationWithLabelName(self): + label_name = "my_label" + head = head_lib._multi_class_head(n_classes=2, label_name=label_name) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = {label_name: tf.constant(self._labels)} + # logloss: z:label, x:logit + # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) + model_fn_ops = head.head_ops({}, labels, + tf.contrib.learn.ModeKeys.TRAIN, + _noop_train_op, logits=logits) + _assert_no_variables(self) + expected_loss = .81326175 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + def testBinaryClassificationWithWeights(self): + n_classes = 2 head = head_lib._multi_class_head( - n_classes=2, weight_column_name="label_weight") - with tf.Graph().as_default(), tf.Session() as sess: - features = {"label_weight": tf.constant([[1.], [0.]])} - logits = tf.constant([[1.], [1.]]) - labels = tf.constant([[1.], [0.]]) + n_classes=n_classes, weight_column_name="label_weight") + with tf.Graph().as_default(), tf.Session(): + weights = ((1.,), (0.,)) + features = {"label_weight": tf.constant(weights)} + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) # logloss: z:label, x:logit # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops(features, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_binary_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(.31326166 / 2, sess.run(model_fn_ops.loss), - delta=1e-6) + expected_total_loss = .31326166 
+ _assert_metrics( + self, expected_total_loss / len(weights), { + "accuracy": 1. / 1, + "accuracy/baseline_label_mean": 1. / 1, + "accuracy/threshold_0.500000_mean": 1. / 1, + "auc": 0. / 1, + "labels/actual_label_mean": 1. / 1, + "labels/prediction_mean": .731059, # softmax + # TODO(ptucker): Is this the correct eval loss, sum not average? + "loss": expected_total_loss, + "precision/positive_threshold_0.500000_mean": 1. / 1, + "recall/positive_threshold_0.500000_mean": 1. / 1, + }, model_fn_ops) def testBinaryClassificationWithCenteredBias(self): head = head_lib._multi_class_head(n_classes=2, enable_centered_bias=True) - with tf.Graph().as_default(), tf.Session() as sess: - logits = tf.constant([[1.], [1.]]) - labels = tf.constant([[1.], [0.]]) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) # logloss: z:label, x:logit # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_binary_metrics(model_fn_ops) _assert_variables(self, expected_global=( "centered_bias_weight:0", "centered_bias_weight/Adagrad:0", @@ -256,50 +404,97 @@ def testBinaryClassificationWithCenteredBias(self): "centered_bias_weight:0", )) tf.global_variables_initializer().run() - self.assertAlmostEqual(0.81326175, sess.run(model_fn_ops.loss), - delta=1e-6) + expected_loss = .81326175 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + + +class MultiClassModelHeadTest(tf.test.TestCase): - def _assert_multi_class_metrics(self, model_fn_ops): - self.assertItemsEqual(( - "accuracy", - "loss", - ), six.iterkeys(model_fn_ops.eval_metric_ops)) + def setUp(self): + self._logits = ((1., 0., 0.),) + self._labels = (2,) + + def _expected_eval_metrics(self, expected_loss): + return { + "accuracy": 0., + "auc": 1. / 4, + "loss": expected_loss, + "auc/class0": 1., + "auc/class1": 1., + "auc/class2": 0., + "labels/actual_label_mean/class0": 0. / 1, + "labels/actual_label_mean/class1": 0. / 1, + "labels/actual_label_mean/class2": 1. 
/ 1, + "labels/logits_mean/class0": self._logits[0][0], + "labels/logits_mean/class1": self._logits[0][1], + "labels/logits_mean/class2": self._logits[0][2], + "labels/prediction_mean/class0": self._logits[0][0], + "labels/prediction_mean/class1": self._logits[0][1], + "labels/prediction_mean/class2": self._logits[0][2], + "labels/probability_mean/class0": 0.576117, # softmax + "labels/probability_mean/class1": 0.211942, # softmax + "labels/probability_mean/class2": 0.211942, # softmax + } def testMultiClass(self): n_classes = 3 - head = head_lib._multi_class_head(n_classes=n_classes) - with tf.Graph().as_default(), tf.Session() as sess: - logits = tf.constant([[1., 0., 0.]]) - labels = tf.constant([2]) + head = head_lib._multi_class_head( + n_classes=n_classes, metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) # logloss: z:label, x:logit # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_multi_class_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(1.5514446, sess.run(model_fn_ops.loss)) + expected_loss = 1.5514446 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) + + def testMultiClassEvalMode(self): + n_classes = 3 + head = head_lib._multi_class_head( + n_classes=n_classes, metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) + # logloss: z:label, x:logit + # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.EVAL, _noop_train_op, logits=logits) self.assertIsNone(model_fn_ops.train_op) + _assert_no_variables(self) + expected_loss = 1.5514446 + _assert_metrics( + self, expected_loss, self._expected_eval_metrics(expected_loss), + model_fn_ops) def testMultiClassWithWeight(self): n_classes = 3 head = head_lib._multi_class_head( - n_classes=n_classes, weight_column_name="label_weight") - with tf.Graph().as_default(), tf.Session() as sess: - features = {"label_weight": tf.constant([0.1])} - logits = tf.constant([[1., 0., 0.]]) - labels = tf.constant([2]) + n_classes=n_classes, weight_column_name="label_weight", + metric_class_ids=range(n_classes)) + with tf.Graph().as_default(), tf.Session(): + weight = .1 + features = {"label_weight": tf.constant([weight])} + logits = tf.constant(self._logits) + labels = tf.constant(self._labels) # logloss: z:label, x:logit # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) model_fn_ops = head.head_ops(features, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=logits) - self._assert_multi_class_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual(.15514446, sess.run(model_fn_ops.loss)) + expected_loss = 1.5514446 + _assert_metrics( + self, expected_loss * weight, + self._expected_eval_metrics(expected_loss), model_fn_ops) def testInvalidNClasses(self): for n_classes in (None, -1, 0, 1): @@ -314,15 +509,9 @@ def setUp(self): # (i.e., < 0) but it is within the [-1,1] margin. There is a 0.5 loss # incurred by this example. The 2nd prediction is outside the margin so it # incurs no loss at all. 
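The expected losses in the binary SVM tests that follow come straight from the hinge-loss definition max(0, 1 - y*x), with {0, 1} labels mapped to {-1, +1}. A short sketch reproducing the two fixture losses in plain Python, independent of the `losses.hinge_loss` op:

```python
def hinge_loss(logit, label):
    # Map a {0, 1} label to {-1, +1}, then apply max(0, 1 - y * x).
    y = 2.0 * label - 1.0
    return max(0.0, 1.0 - y * logit)


# Matches self._expected_losses = (.5, 0.) for the fixtures below.
print(hinge_loss(-0.5, 0))  # 0.5: correct side of zero, but inside the margin
print(hinge_loss(1.2, 1))   # 0.0: outside the margin, so no loss
```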
- self._predictions = ((-0.5,), (1.2,)) + self._predictions = ((-.5,), (1.2,)) self._labels = (0, 1) - self._expected_losses = (0.5, 0.0) - - def _assert_metrics(self, model_fn_ops): - self.assertItemsEqual(( - "accuracy", - "loss", - ), six.iterkeys(model_fn_ops.eval_metric_ops)) + self._expected_losses = (.5, 0.) def testBinarySVMDefaultWeights(self): head = head_lib._binary_svm_head() @@ -332,31 +521,62 @@ def testBinarySVMDefaultWeights(self): model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=predictions) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual( - np.average(self._expected_losses), model_fn_ops.loss.eval()) + expected_loss = np.average(self._expected_losses) + _assert_metrics(self, expected_loss, { + "accuracy": 1., + "loss": expected_loss, + }, model_fn_ops) - model_fn_ops = head.head_ops({}, labels, - tf.contrib.learn.ModeKeys.EVAL, - _noop_train_op, logits=predictions) - self.assertIsNone(model_fn_ops.train_op) + def testBinarySVMEvalMode(self): + head = head_lib._binary_svm_head() + with tf.Graph().as_default(), tf.Session(): + predictions = tf.constant(self._predictions) + labels = tf.constant(self._labels) + model_fn_ops = head.head_ops({}, labels, + tf.contrib.learn.ModeKeys.EVAL, + _noop_train_op, logits=predictions) + self.assertIsNone(model_fn_ops.train_op) + _assert_no_variables(self) + expected_loss = np.average(self._expected_losses) + _assert_metrics(self, expected_loss, { + "accuracy": 1., + "loss": expected_loss, + }, model_fn_ops) + + def testBinarySVMWithLabelName(self): + label_name = "my_label" + head = head_lib._binary_svm_head(label_name=label_name) + with tf.Graph().as_default(), tf.Session(): + predictions = tf.constant(self._predictions) + labels = {label_name: tf.constant(self._labels)} + model_fn_ops = head.head_ops({}, labels, + tf.contrib.learn.ModeKeys.TRAIN, + _noop_train_op, logits=predictions) + _assert_no_variables(self) + expected_loss = np.average(self._expected_losses) + _assert_metrics(self, expected_loss, { + "accuracy": 1., + "loss": expected_loss, + }, model_fn_ops) def testBinarySVMWithWeights(self): head = head_lib._binary_svm_head(weight_column_name="weights") with tf.Graph().as_default(), tf.Session(): predictions = tf.constant(self._predictions) labels = tf.constant(self._labels) - weights = (7.0, 11.0) + weights = (7., 11.) 
features = {"weights": tf.constant(weights)} model_fn_ops = head.head_ops(features, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=predictions) - self._assert_metrics(model_fn_ops) _assert_no_variables(self) - self.assertAlmostEqual( - np.sum(np.multiply(weights, self._expected_losses)) / 2.0, - model_fn_ops.loss.eval()) + expected_weighted_sum = np.sum(np.multiply( + weights, self._expected_losses)) + _assert_metrics(self, expected_weighted_sum / len(weights), { + "accuracy": 1., + "loss": expected_weighted_sum / np.sum(weights), + }, model_fn_ops) def testBinarySVMWithCenteredBias(self): head = head_lib._binary_svm_head(enable_centered_bias=True) @@ -366,7 +586,6 @@ def testBinarySVMWithCenteredBias(self): model_fn_ops = head.head_ops({}, labels, tf.contrib.learn.ModeKeys.TRAIN, _noop_train_op, logits=predictions) - self._assert_metrics(model_fn_ops) _assert_variables(self, expected_global=( "centered_bias_weight:0", "centered_bias_weight/Adagrad:0", @@ -374,8 +593,11 @@ def testBinarySVMWithCenteredBias(self): "centered_bias_weight:0", )) tf.global_variables_initializer().run() - self.assertAlmostEqual( - np.average(self._expected_losses), model_fn_ops.loss.eval()) + expected_loss = np.average(self._expected_losses) + _assert_metrics(self, expected_loss, { + "accuracy": 1., + "loss": expected_loss, + }, model_fn_ops) def _noop_train_op(unused_loss): diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index 0405eb04766360..b9ad438898d4fb 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -451,11 +451,13 @@ def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, return self def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, - batch_size=None, steps=None, metrics=None, name=None): + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): """See evaluable.Evaluable. 
Note: Labels must be integer class indices.""" return self._estimator.evaluate(x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, - steps=steps, metrics=metrics, name=name) + steps=steps, metrics=metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -727,11 +729,13 @@ def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, return self def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, - batch_size=None, steps=None, metrics=None, name=None): + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): """See evaluable.Evaluable.""" return self._estimator.evaluate(x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, - steps=steps, metrics=metrics, name=name) + steps=steps, metrics=metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py index 50f0d2d75d4963..2a5eb29ef96587 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py @@ -28,6 +28,7 @@ from tensorflow.contrib.learn.python.learn.estimators import _sklearn from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils +from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec @@ -40,13 +41,6 @@ def _prepare_iris_data_for_logistic_regression(): return iris -def _iris_input_fn(): - iris = tf.contrib.learn.datasets.load_iris() - return { - 'feature': tf.constant(iris.data, dtype=tf.float32) - }, tf.constant(iris.target, shape=[150, 1], dtype=tf.int32) - - class LinearClassifierTest(tf.test.TestCase): def testEstimatorContract(self): @@ -61,7 +55,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -81,10 +75,11 @@ def testJointTrain(self): def input_fn(): return { - 'age': tf.SparseTensor(values=['1'], indices=[[0, 0]], shape=[1, 1]), + 'age': tf.SparseTensor( + values=['1'], indices=[[0, 0]], dense_shape=[1, 1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -109,8 +104,9 @@ def testMultiClass_MatrixData(self): n_classes=3, feature_columns=[feature_column]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) + scores = classifier.evaluate( + input_fn=test_data.iris_input_multiclass_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testMultiClass_MatrixData_Labels1D(self): @@ -204,7 +200,7 @@ def testWeightAndBiasNames(self): n_classes=3, feature_columns=[feature_column]) - classifier.fit(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) self.assertEqual(4, len(classifier.weights_)) self.assertEqual(3, len(classifier.bias_)) @@ -218,8 +214,9 @@ def testCustomOptimizerByObject(self): 
optimizer=tf.train.FtrlOptimizer(learning_rate=0.1), feature_columns=[feature_column]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) + scores = classifier.evaluate( + input_fn=test_data.iris_input_multiclass_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testCustomOptimizerByString(self): @@ -235,8 +232,9 @@ def _optimizer(): optimizer=_optimizer, feature_columns=[feature_column]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) + scores = classifier.evaluate( + input_fn=test_data.iris_input_multiclass_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testCustomOptimizerByFunction(self): @@ -249,8 +247,9 @@ def testCustomOptimizerByFunction(self): optimizer='Ftrl', feature_columns=[feature_column]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) + scores = classifier.evaluate( + input_fn=test_data.iris_input_multiclass_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testCustomMetrics(self): @@ -353,7 +352,7 @@ def _input_fn(): features = { 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } labels = tf.constant([[1], [0], [0]]) return features, labels @@ -392,7 +391,7 @@ def input_fn(num_epochs=None): return { 'age': tf.train.limit_epochs(tf.constant([1]), num_epochs=num_epochs), 'language': tf.SparseTensor( - values=['english'], indices=[[0, 0]], shape=[1, 1]), + values=['english'], indices=[[0, 0]], dense_shape=[1, 1]), }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -503,7 +502,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -524,7 +523,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -543,7 +542,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -561,7 +560,7 @@ def input_fn(): return { 'language': tf.SparseTensor(values=['hindi'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -586,7 +585,7 @@ def input_fn(): return { 'language': tf.SparseTensor(values=['Swahili', 'turkish'], indices=[[0, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) }, tf.constant([[1], [1], [1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -677,7 +676,7 @@ def input_fn(): 'price': tf.constant([[0.4], [0.6], [0.3]]), 'country': tf.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], - shape=[3, 5]), + dense_shape=[3, 5]), 'weights': tf.constant([[1.0], 
[1.0], [1.0]]) }, tf.constant([[1], [0], [1]]) @@ -702,10 +701,10 @@ def input_fn(): 'example_id': tf.constant(['1', '2', '3']), 'price': tf.SparseTensor(values=[2., 3., 1.], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 5]), + dense_shape=[3, 5]), 'country': tf.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 5]) + dense_shape=[3, 5]) }, tf.constant([[1], [0], [1]]) country = tf.contrib.layers.sparse_column_with_hash_bucket( @@ -729,10 +728,10 @@ def input_fn(): 'example_id': tf.constant(['1', '2', '3']), 'language': tf.SparseTensor(values=['english', 'italian', 'spanish'], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]), + dense_shape=[3, 1]), 'country': tf.SparseTensor(values=['US', 'IT', 'MX'], indices=[[0, 0], [1, 0], [2, 0]], - shape=[3, 1]) + dense_shape=[3, 1]) }, tf.constant([[0], [0], [1]]) language = tf.contrib.layers.sparse_column_with_hash_bucket( @@ -760,7 +759,7 @@ def input_fn(): 'sq_footage': tf.constant([[900.0], [700.0], [600.0]]), 'country': tf.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], - shape=[3, 5]), + dense_shape=[3, 5]), 'weights': tf.constant([[3.0], [1.0], [1.0]]) }, tf.constant([[1], [0], [1]]) @@ -792,7 +791,7 @@ def input_fn(): 'age': tf.constant([[1], [2]]), 'language': tf.SparseTensor(values=['greek', 'chinese'], indices=[[0, 0], [1, 0]], - shape=[2, 1]), + dense_shape=[2, 1]), }, tf.constant([[1], [0]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -823,7 +822,7 @@ def input_fn(): 'age': tf.constant([1]), 'language': tf.SparseTensor(values=['english'], indices=[[0, 0]], - shape=[1, 1]) + dense_shape=[1, 1]) }, tf.constant([[10.]]) language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100) @@ -848,8 +847,9 @@ def testRegression_MatrixData(self): feature_columns=cont_features, config=tf.contrib.learn.RunConfig(tf_random_seed=1)) - regressor.fit(input_fn=_iris_input_fn, steps=100) - scores = regressor.evaluate(input_fn=_iris_input_fn, steps=1) + regressor.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) + scores = regressor.evaluate( + input_fn=test_data.iris_input_multiclass_fn, steps=1) self.assertLess(scores['loss'], 0.2) def testRegression_TensorData(self): @@ -860,7 +860,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1.0, 0., 0.2], dtype=tf.float32) @@ -975,7 +975,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant(labels, dtype=tf.float32) @@ -1005,7 +1005,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant(labels, dtype=tf.float32) @@ -1100,7 +1100,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1.0, 0., 0.2], dtype=tf.float32) @@ -1135,7 +1135,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, 
tf.constant([1.0, 0., 0.2], dtype=tf.float32) @@ -1177,7 +1177,7 @@ def _input_fn(num_epochs=None): num_epochs=num_epochs), 'language': tf.SparseTensor(values=['en', 'fr', 'zh'], indices=[[0, 0], [0, 1], [2, 0]], - shape=[3, 2]) + dense_shape=[3, 2]) } return features, tf.constant([1.0, 0., 0.2], dtype=tf.float32) diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py index de5f39657833b3..ba2ac31995847c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py +++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py @@ -150,7 +150,8 @@ def evaluate(self, batch_size=None, steps=None, metrics=None, - name=None): + name=None, + checkpoint_path=None): """Evaluates given model with provided evaluation data. See superclass Estimator for more details. @@ -164,15 +165,19 @@ def evaluate(self, steps: Number of steps for which to evaluate model. metrics: Dict of metric ops to run. If None, the default metrics are used. name: Name of the evaluation. + checkpoint_path: A specific checkpoint to use. By default, use the latest + checkpoint in the `model_dir`. Returns: Returns `dict` with evaluation results. """ metrics = metrics or self.get_default_metrics(thresholds=self._thresholds) - return super(LogisticRegressor, self).evaluate(x=x, - y=y, - input_fn=input_fn, - batch_size=batch_size, - steps=steps, - metrics=metrics, - name=name) + return super(LogisticRegressor, self).evaluate( + x=x, + y=y, + input_fn=input_fn, + batch_size=batch_size, + steps=steps, + metrics=metrics, + name=name, + checkpoint_path=checkpoint_path) diff --git a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py index 8df08e507fed33..10ac888eca7a0f 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py +++ b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py @@ -19,10 +19,16 @@ class MetricKey(object): + """Metric key strings.""" LOSS = "loss" AUC = "auc" + CLASS_AUC = "auc/class%d" PREDICTION_MEAN = "labels/prediction_mean" + CLASS_PREDICTION_MEAN = "labels/prediction_mean/class%d" + CLASS_LOGITS_MEAN = "labels/logits_mean/class%d" + CLASS_PROBABILITY_MEAN = "labels/probability_mean/class%d" LABEL_MEAN = "labels/actual_label_mean" + CLASS_LABEL_MEAN = "labels/actual_label_mean/class%d" ACCURACY = "accuracy" ACCURACY_BASELINE = "accuracy/baseline_label_mean" ACCURACY_MEAN = "accuracy/threshold_%f_mean" diff --git a/tensorflow/contrib/learn/python/learn/estimators/nonlinear_test.py b/tensorflow/contrib/learn/python/learn/estimators/nonlinear_test.py index 364f826877f6c2..7c1087eb8dfd26 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/nonlinear_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/nonlinear_test.py @@ -54,13 +54,23 @@ def testBostonDNN(self): config=tf.contrib.learn.RunConfig(tf_random_seed=1)) regressor.fit( boston.data, boston.target, steps=300, batch_size=boston.data.shape[0]) - weights = regressor.weights_ + weights = ([regressor.get_variable_value("dnn/hiddenlayer_0/weights")] + + [regressor.get_variable_value("dnn/hiddenlayer_1/weights")] + + [regressor.get_variable_value("dnn/hiddenlayer_2/weights")] + + [regressor.get_variable_value("dnn/logits/weights")]) self.assertEqual(weights[0].shape, (13, 10)) self.assertEqual(weights[1].shape, (10, 20)) self.assertEqual(weights[2].shape, (20, 10)) self.assertEqual(weights[3].shape, (10, 
1)) - biases = regressor.bias_ - self.assertEqual(len(biases), 4) + + biases = ([regressor.get_variable_value("dnn/hiddenlayer_0/biases")] + + [regressor.get_variable_value("dnn/hiddenlayer_1/biases")] + + [regressor.get_variable_value("dnn/hiddenlayer_2/biases")] + + [regressor.get_variable_value("dnn/logits/biases")]) + self.assertEqual(biases[0].shape, (10,)) + self.assertEqual(biases[1].shape, (20,)) + self.assertEqual(biases[2].shape, (10,)) + self.assertEqual(biases[3].shape, (1,)) def testDNNDropout0(self): # Dropout prob == 0. diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py index deb55efc9f3f15..d2d464ca72c87c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py +++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py @@ -218,43 +218,50 @@ def __init__(self, params, device_assigner=None, model_dir=None, model_dir=model_dir, config=config, feature_engineering_fn=feature_engineering_fn) + self._skcompat = estimator.SKCompat(self._estimator) + + @property + def model_dir(self): + """See evaluable.Evaluable.""" + return self._estimator.model_dir def evaluate( - self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, - steps=None, metrics=None, name=None): + self, x=None, y=None, input_fn=None, batch_size=None, + steps=None, metrics=None, name=None, checkpoint_path=None): """See evaluable.Evaluable.""" - return self._estimator.evaluate( - input_fn=input_fn, x=x, y=y, feed_fn=feed_fn, - batch_size=batch_size, steps=steps, - metrics=metrics, name=name) + if x is not None and y is not None: + return self._skcompat.score(x, y, batch_size=batch_size, steps=steps, + metrics=metrics) + elif input_fn is not None: + return self._estimator.evaluate(input_fn=input_fn, steps=steps, + metrics=metrics, name=name, + checkpoint_path=checkpoint_path) + else: + raise ValueError( + 'evaluate: Must provide either both x and y or input_fn.') def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None): """See trainable.Trainable.""" if not monitors: monitors = [TensorForestLossHook(self.early_stopping_rounds)] - self._estimator.fit(input_fn=input_fn, x=x, y=y, - batch_size=batch_size, steps=steps, monitors=monitors, - max_steps=max_steps) + if x is not None and y is not None: + self._skcompat.fit(x, y, batch_size=batch_size, steps=steps, + max_steps=max_steps, monitors=monitors) + elif input is not None: + self._estimator.fit(input_fn=input_fn, steps=steps, monitors=monitors, + max_steps=max_steps) + else: + raise ValueError('fit: Must provide either both x and y or input_fn.') - @deprecated_arg_values( - estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, - as_iterable=False) def predict_proba( - self, x=None, input_fn=None, batch_size=None, outputs=None, - as_iterable=True): + self, x=None, input_fn=None, batch_size=None): """Returns prediction probabilities for given features (classification). Args: x: features. input_fn: Input function. If set, x and y must be None. batch_size: Override default batch size. - outputs: list of `str`, name of the output to predict. - If `None`, returns all. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). 
Returns: Numpy array of predicted probabilities (or an iterable of predicted @@ -263,21 +270,15 @@ def predict_proba( Raises: ValueError: If both or neither of x and input_fn were given. """ - results = self._estimator.predict( - x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, - as_iterable=as_iterable) - - if as_iterable: - return (x[eval_metrics.INFERENCE_PROB_NAME] for x in results) - else: + if x is not None: + results = self._skcompat.predict(x, batch_size=batch_size) return results[eval_metrics.INFERENCE_PROB_NAME] + else: + results = self._estimator.predict(input_fn=input_fn, as_iterable=True) + return (x[eval_metrics.INFERENCE_PROB_NAME] for x in results) - @deprecated_arg_values( - estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, - as_iterable=False) def predict( - self, x=None, input_fn=None, axis=None, batch_size=None, outputs=None, - as_iterable=True): + self, x=None, input_fn=None, axis=None, batch_size=None): """Returns predictions for given features. Args: @@ -286,50 +287,37 @@ def predict( axis: Axis on which to argmax (for classification). Last axis is used by default. batch_size: Override default batch size. - outputs: list of `str`, name of the output to predict. - If `None`, returns all. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). Returns: Numpy array of predicted classes or regression values (or an iterable of predictions if as_iterable is True). """ - results = self._estimator.predict( - x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, - as_iterable=as_iterable) - predict_name = (eval_metrics.INFERENCE_PROB_NAME if self.params.regression else eval_metrics.INFERENCE_PRED_NAME) - if as_iterable: - return (x[predict_name] for x in results) - else: + if x is not None: + results = self._skcompat.predict(x, batch_size=batch_size) return results[predict_name] + else: + results = self._estimator.predict(input_fn=input_fn, as_iterable=True) + return (x[predict_name] for x in results) - @deprecated_arg_values( - estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, - as_iterable=False) def predict_with_keys( - self, x=None, input_fn=None, axis=None, batch_size=None, outputs=None, - as_iterable=True): + self, x=None, input_fn=None, axis=None, batch_size=None): """Same as predict but also returns the example keys.""" - results = self._estimator.predict( - x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, - as_iterable=as_iterable) - predict_name = (eval_metrics.INFERENCE_PROB_NAME if self.params.regression else eval_metrics.INFERENCE_PRED_NAME) - if as_iterable: - return ((x[predict_name], x.get(KEYS_NAME, None)) for x in results) + if x is not None: + results = self._skcompat.predict(x, batch_size=batch_size) + return results[predict_name] else: - return results[predict_name], results.get(KEYS_NAME, None) + results = self._estimator.predict(input_fn=input_fn, as_iterable=True) + return ((x[predict_name], x.get(KEYS_NAME, None)) for x in results) def export(self, export_dir, input_fn, signature_fn=None, + input_feature_key=None, default_batch_size=1): """See BaseEstimator.export.""" # Reset model function with basic device assigner. 
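The `TensorForestEstimator` changes above all apply one dispatch rule: numpy-style `x`/`y` arguments go through the new `estimator.SKCompat` wrapper, while an `input_fn` is handed directly to the wrapped `Estimator`. The schematic below restates that rule with placeholder bodies rather than the real implementations; note that the diff's `fit` guards its second branch with `input is not None`, which tests the Python builtin and is always true, so `input_fn is not None`, as used in this sketch, appears to be the intended check.

```python
class _DispatchSketch(object):
    """Schematic of the x/y vs. input_fn dispatch used above (not real code)."""

    def __init__(self, skcompat, estimator):
        self._skcompat = skcompat    # sklearn-style wrapper: arrays in, arrays out
        self._estimator = estimator  # core Estimator: input_fn based

    def evaluate(self, x=None, y=None, input_fn=None, **kwargs):
        if x is not None and y is not None:
            return self._skcompat.score(x, y, **kwargs)
        elif input_fn is not None:
            return self._estimator.evaluate(input_fn=input_fn, **kwargs)
        raise ValueError("Must provide either both x and y or input_fn.")

    def fit(self, x=None, y=None, input_fn=None, **kwargs):
        if x is not None and y is not None:
            return self._skcompat.fit(x, y, **kwargs)
        elif input_fn is not None:  # presumably intended check; see note above
            return self._estimator.fit(input_fn=input_fn, **kwargs)
        raise ValueError("Must provide either both x and y or input_fn.")
```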
@@ -343,7 +331,9 @@ def export(self, weights_name=self.weights_name) result = self._estimator.export( export_dir=export_dir, - use_deprecated_input_fn=True, + input_fn=input_fn, + input_feature_key=input_feature_key, + use_deprecated_input_fn=False, signature_fn=(signature_fn or (export.regression_signature_fn if self.params.regression else diff --git a/tensorflow/contrib/learn/python/learn/estimators/stability_test.py b/tensorflow/contrib/learn/python/learn/estimators/stability_test.py index c78fdb704378b9..981ea5c9d3bf0d 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/stability_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/stability_test.py @@ -124,9 +124,18 @@ def testDNNRegression(self): optimizer=_NULL_OPTIMIZER, config=config) regressor2.fit(x=boston.data, y=boston.target, steps=1) - for w1, w2 in zip(regressor1.weights_, regressor2.weights_): + weights1 = ([regressor1.get_variable_value('dnn/hiddenlayer_0/weights')] + + [regressor1.get_variable_value('dnn/logits/weights')]) + weights2 = ([regressor2.get_variable_value('dnn/hiddenlayer_0/weights')] + + [regressor2.get_variable_value('dnn/logits/weights')]) + for w1, w2 in zip(weights1, weights2): self.assertAllClose(w1, w2) - for b1, b2 in zip(regressor2.bias_, regressor2.bias_): + + biases1 = ([regressor1.get_variable_value('dnn/hiddenlayer_0/biases')] + + [regressor1.get_variable_value('dnn/logits/biases')]) + biases2 = ([regressor2.get_variable_value('dnn/hiddenlayer_0/biases')] + + [regressor2.get_variable_value('dnn/logits/biases')]) + for b1, b2 in zip(biases1, biases2): self.assertAllClose(b1, b2) self.assertAllClose( list(regressor1.predict(boston.data, as_iterable=True)), diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py index a6e4e7b6a3853d..561a898e78f2f4 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/svm.py +++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py @@ -168,6 +168,11 @@ def __init__(self, if not self._estimator.config.is_chief: self._chief_hook = None + @property + def model_dir(self): + """See trainable.Evaluable.""" + return self._estimator.model_dir + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None): """See trainable.Trainable.""" @@ -181,11 +186,13 @@ def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, # pylint: disable=protected-access def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None, - batch_size=None, steps=None, metrics=None, name=None): + batch_size=None, steps=None, metrics=None, name=None, + checkpoint_path=None): """See evaluable.Evaluable.""" return self._estimator.evaluate(x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size, - steps=steps, metrics=metrics, name=name) + steps=steps, metrics=metrics, name=name, + checkpoint_path=checkpoint_path) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, diff --git a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py index bfbd9de397375e..84c0b7c2d23f36 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py @@ -76,7 +76,8 @@ def testTensorSignatureCompatible(self): {'a': placeholder_c}, signatures)) def testSparseTensorCompatible(self): - t = tf.SparseTensor(indices=[[0, 0], [1, 
2]], values=[1, 2], shape=[3, 4]) + t = tf.SparseTensor( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) signatures = tensor_signature.create_signatures(t) self.assertTrue(tensor_signature.tensors_compatible(t, signatures)) diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py index 17d2929f1b453e..aff0d70cd580d2 100644 --- a/tensorflow/contrib/learn/python/learn/evaluable.py +++ b/tensorflow/contrib/learn/python/learn/evaluable.py @@ -27,10 +27,15 @@ class Evaluable(object): """ __metaclass__ = abc.ABCMeta + @abc.abstractproperty + def model_dir(self): + """Returns a path in which the eval process will look for checkpoints.""" + raise NotImplementedError + @abc.abstractmethod def evaluate( self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, - steps=None, metrics=None, name=None): + steps=None, metrics=None, name=None, checkpoint_path=None): """Evaluates given model with provided evaluation data. Stop conditions - we evaluate on the given input data until one of the @@ -83,6 +88,8 @@ def evaluate( `../../../metrics/python/ops/metrics_ops.py`. name: Name of the evaluation if user needs to run multiple evaluations on different data sets, such as on training data vs test data. + checkpoint_path: Path of a specific checkpoint to evaluate. If `None`, the + latest checkpoint in `model_dir` is used. Returns: Returns `dict` with evaluation results. diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index 2615fa8a00c7e7..9e4b9f579340ee 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -21,19 +21,21 @@ import contextlib import math +import os import time from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import deprecated_arg_values from tensorflow.contrib.learn.python.learn import evaluable +from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn import monitors from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import run_config -from tensorflow.contrib.learn.python.learn.estimators._sklearn import NotFittedError from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import saver from tensorflow.python.training import server_lib - +from tensorflow.python.util import compat __all__ = ["Experiment"] @@ -70,7 +72,8 @@ def __init__(self, eval_delay_secs=120, continuous_eval_throttle_secs=60, min_eval_frequency=1, - delay_workers_by_global_step=False): + delay_workers_by_global_step=False, + export_strategies=None): """Constructor for `Experiment`. Creates an Experiment instance. None of the functions passed to this @@ -103,9 +106,11 @@ def __init__(self, occur if no new snapshot is available, hence, this is the minimum. delay_workers_by_global_step: if `True` delays training workers based on global step instead of time. + export_strategies: A list of `ExportStrategy`s, or a single one, or None. Raises: - ValueError: if `estimator` does not implement `Evaluable` and `Trainable`. + ValueError: if `estimator` does not implement `Evaluable` and `Trainable`, + or if export_strategies has the wrong type. 
""" if not isinstance(estimator, evaluable.Evaluable): raise ValueError("`estimator` must implement `Evaluable`.") @@ -125,6 +130,16 @@ def __init__(self, self._min_eval_frequency = min_eval_frequency self._delay_workers_by_global_step = delay_workers_by_global_step + if export_strategies is None: + self._export_strategies = [] + elif isinstance(export_strategies, list): + self._export_strategies = export_strategies + elif isinstance(export_strategies, export_strategy.ExportStrategy): + self._export_strategies = [export_strategies] + else: + raise ValueError("`export_strategies` must be an ExportStrategy, " + "a list of ExportStrategies, or None.") + @property def estimator(self): return self._estimator @@ -219,7 +234,8 @@ def _continuous_eval(self, input_fn, name, delay_secs, - throttle_delay_secs): + throttle_delay_secs, + evaluate_checkpoint_only_once=True): """Run continuous eval. Runs infinite eval on the evaluation data set. This function starts @@ -235,6 +251,8 @@ def _continuous_eval(self, throttle_delay_secs: Do not re-evaluate unless the last evaluation was started at least this many seconds ago. If None, defaults to self._continuous_eval_throttle_secs. + evaluate_checkpoint_only_once: Whether to skip evaluation of checkpoints + that have already been evaluated. Default is `True`. """ if delay_secs is None: delay_secs = self._eval_delay_secs @@ -245,21 +263,37 @@ def _continuous_eval(self, logging.info("Waiting %f secs before starting eval.", delay_secs) time.sleep(delay_secs) - last_fitted_error_time = 0 + previous_path = None + last_warning_time = 0 while True: start = time.time() - try: - self._estimator.evaluate(input_fn=input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=name) - except NotFittedError: + + error_msg = None + latest_path = saver.latest_checkpoint(self._estimator.model_dir) + if not latest_path: + error_msg = ("Estimator is not fitted yet. " + "Will start an evaluation when a checkpoint is ready.") + elif evaluate_checkpoint_only_once and latest_path == previous_path: + error_msg = "No new checkpoint ready for evaluation." + + if error_msg: # Print warning message every 10 mins. - if time.time() - last_fitted_error_time > 600: - logging.warning( - "Estimator is not fitted yet. " - "Will start an evaluation when a checkpoint will be ready.") - last_fitted_error_time = time.time() + if time.time() - last_warning_time > 600: + logging.warning(error_msg) + last_warning_time = time.time() + else: + eval_result = self._estimator.evaluate(input_fn=input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=name, + checkpoint_path=latest_path) + + # TODO(soergel): further throttle how often export happens? 
+ self._maybe_export(eval_result) + + # Clear warning timer and update last evaluated checkpoint + last_warning_time = 0 + previous_path = latest_path duration = time.time() - start if duration < throttle_delay_secs: @@ -268,11 +302,14 @@ def _continuous_eval(self, difference) time.sleep(difference) - def continuous_eval(self, delay_secs=None, throttle_delay_secs=None): - self._continuous_eval(self._eval_input_fn, - name="continuous", - delay_secs=delay_secs, - throttle_delay_secs=throttle_delay_secs) + def continuous_eval(self, delay_secs=None, throttle_delay_secs=None, + evaluate_checkpoint_only_once=True): + self._continuous_eval( + self._eval_input_fn, + name="continuous", + delay_secs=delay_secs, + throttle_delay_secs=throttle_delay_secs, + evaluate_checkpoint_only_once=evaluate_checkpoint_only_once) def continuous_eval_on_train_data(self, delay_secs=None, @@ -323,11 +360,32 @@ def train_and_evaluate(self): )] self.train(delay_secs=0) - return self._estimator.evaluate(input_fn=self._eval_input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=eval_dir_suffix) - + eval_result = self._estimator.evaluate(input_fn=self._eval_input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=eval_dir_suffix) + export_results = self._maybe_export(eval_result) + return eval_result, export_results + + def _maybe_export(self, eval_result): # pylint: disable=unused-argument + """Export the Estimator using export_fn, if defined.""" + export_dir_base = os.path.join( + compat.as_bytes(self._estimator.model_dir), + compat.as_bytes("export")) + + export_results = [] + for strategy in self._export_strategies: + # TODO(soergel): possibly, allow users to decide whether to export here + # based on the eval_result (e.g., to keep the best export). + + export_results.append( + strategy.export( + self._estimator, + os.path.join( + compat.as_bytes(export_dir_base), + compat.as_bytes(strategy.name)))) + + return export_results def run_std_server(self): """Starts a TensorFlow server and joins the serving thread. @@ -380,7 +438,14 @@ def _new_attr_context(obj, attr): This creates a context in which an object's attribute can be changed. Once the context is exited, the attribute reverts to its original value. - Example usage: + Args: + obj: An object whose attribute to restore at the end of the context. + attr: An attribute to remember and restore at the end of the context. + + Yields: + Context. 
+ + Example: my_obj.x = 1 with _new_attr_context(my_obj, "x"): my_obj.x = 2 diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py index d0f04ee8424756..475c7c5626cabe 100644 --- a/tensorflow/contrib/learn/python/learn/experiment_test.py +++ b/tensorflow/contrib/learn/python/learn/experiment_test.py @@ -17,11 +17,16 @@ from __future__ import print_function import json +import os +import tempfile +import threading import time import tensorflow as tf from tensorflow.contrib.learn.python.learn import run_config +from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils +from tensorflow.python.util import compat from tensorflow.python.util.all_util import reveal_undocumented patch = tf.test.mock.patch @@ -29,11 +34,18 @@ class TestEstimator(tf.contrib.learn.Evaluable, tf.contrib.learn.Trainable): - def __init__(self, config=None): + def __init__(self, config=None, max_evals=5): self.eval_count = 0 self.fit_count = 0 + self._max_evals = max_evals + self.export_count = 0 self.monitors = [] self._config = config or run_config.RunConfig() + self._model_dir = tempfile.mkdtemp() + + @property + def model_dir(self): + return self._model_dir @property def config(self): @@ -42,18 +54,34 @@ def config(self): def evaluate(self, **kwargs): tf.logging.info('evaluate called with args: %s' % kwargs) self.eval_count += 1 - if self.eval_count > 5: - tf.logging.info('Ran 6 evals. Done.') + if self.eval_count > self._max_evals: + tf.logging.info('Ran %d evals. Done.' % self.eval_count) raise StopIteration() return [(key, kwargs[key]) for key in sorted(kwargs.keys())] + def fake_checkpoint(self): + save_path = os.path.join(self.model_dir, 'model.ckpt') + with tf.Session() as sess: + var = tf.Variable(1.0, name='var0') + save = tf.train.Saver({var.op.name: var}) + var.initializer.run() + save.save(sess, save_path, global_step=0) + def fit(self, **kwargs): + self.fake_checkpoint() tf.logging.info('fit called with args: %s' % kwargs) self.fit_count += 1 if 'monitors' in kwargs: self.monitors = kwargs['monitors'] return [(key, kwargs[key]) for key in sorted(kwargs.keys())] + def export_savedmodel(self, export_dir_base, export_input_fn, **kwargs): + tf.logging.info('export_savedmodel called with args: %s, %s, %s' + % (export_dir_base, export_input_fn, kwargs)) + self.export_count += 1 + return os.path.join(compat.as_bytes(export_dir_base), + compat.as_bytes('bogus_timestamp')) + class ExperimentTest(tf.test.TestCase): @@ -89,7 +117,7 @@ def test_train_delay(self): start = time.time() ex.train(delay_secs=delay) duration = time.time() - start - self.assertAlmostEqual(duration, delay, delta=0.5) + self.assertAlmostEqual(duration, delay, delta=1.0) def test_train_default_delay(self): for task_id in [0, 1, 3]: @@ -103,7 +131,7 @@ def test_train_default_delay(self): start = time.time() ex.train() duration = time.time() - start - self.assertAlmostEqual(duration, task_id * 5, delta=0.5) + self.assertAlmostEqual(duration, task_id * 5, delta=1.0) @tf.test.mock.patch('tensorflow.python.training.server_lib.Server') # pylint: disable=line-too-long def test_train_starts_server(self, mock_server): @@ -145,7 +173,7 @@ def test_train_starts_server(self, mock_server): mock_server.assert_has_calls([tf.test.mock.call().start()]) # Ensure that the delay takes into account the time to start the server. 
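A side note on why the tests write a fake checkpoint before evaluating: the checkpoint-gated eval path above only proceeds once a checkpoint exists in `model_dir`. A small standalone sketch of that behavior (the variable and directory are stand-ins, mirroring `fake_checkpoint`):

```python
import os
import tempfile

import tensorflow as tf

model_dir = tempfile.mkdtemp()  # stand-in for an estimator's model_dir

print(tf.train.latest_checkpoint(model_dir))  # None: nothing saved yet

with tf.Session() as sess:
  var = tf.Variable(1.0, name='var0')
  saver = tf.train.Saver({var.op.name: var})
  var.initializer.run()
  saver.save(sess, os.path.join(model_dir, 'model.ckpt'), global_step=0)

print(tf.train.latest_checkpoint(model_dir))  # .../model.ckpt-0
```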
- self.assertAlmostEqual(duration, 1.0, delta=0.5) + self.assertAlmostEqual(duration, 1.0, delta=1) @tf.test.mock.patch('tensorflow.python.training.server_lib.Server') # pylint: disable=line-too-long def test_train_server_does_not_start_without_cluster_spec(self, mock_server): @@ -195,6 +223,7 @@ def test_train_raises_if_job_name_is_missing(self): def test_evaluate(self): est = TestEstimator() + est.fake_checkpoint() ex = tf.contrib.learn.Experiment( est, train_input_fn='train_input', @@ -208,6 +237,7 @@ def test_evaluate(self): def test_evaluate_delay(self): est = TestEstimator() + est.fake_checkpoint() ex = tf.contrib.learn.Experiment( est, train_input_fn='train_input', eval_input_fn='eval_input') @@ -220,6 +250,7 @@ def test_evaluate_delay(self): def test_continuous_eval(self): est = TestEstimator() + est.fake_checkpoint() ex = tf.contrib.learn.Experiment( est, train_input_fn='train_input', @@ -227,13 +258,15 @@ def test_continuous_eval(self): eval_metrics='eval_metrics', eval_delay_secs=0, continuous_eval_throttle_secs=0) - self.assertRaises(StopIteration, ex.continuous_eval) + self.assertRaises(StopIteration, ex.continuous_eval, + evaluate_checkpoint_only_once=False) self.assertEquals(6, est.eval_count) self.assertEquals(0, est.fit_count) def test_continuous_eval_throttle_delay(self): for delay in [0, 1, 2]: est = TestEstimator() + est.fake_checkpoint() ex = tf.contrib.learn.Experiment( est, train_input_fn='train_input', @@ -242,7 +275,8 @@ def test_continuous_eval_throttle_delay(self): continuous_eval_throttle_secs=delay, eval_delay_secs=0) start = time.time() - self.assertRaises(StopIteration, ex.continuous_eval) + self.assertRaises(StopIteration, ex.continuous_eval, + evaluate_checkpoint_only_once=False) duration = time.time() - start expected = 5 * delay tf.logging.info('eval duration (expected %f): %f', expected, duration) @@ -268,16 +302,20 @@ def test_run_local(self): def test_train_and_evaluate(self): est = TestEstimator() + export_strategy = saved_model_export_utils.make_export_strategy( + est, 'export_input') ex = tf.contrib.learn.Experiment( est, train_input_fn='train_input', eval_input_fn='eval_input', eval_metrics='eval_metrics', train_steps=100, - eval_steps=100) + eval_steps=100, + export_strategies=export_strategy) ex.train_and_evaluate() self.assertEquals(1, est.fit_count) self.assertEquals(1, est.eval_count) + self.assertEquals(1, est.export_count) self.assertEquals(1, len(est.monitors)) self.assertTrue( isinstance(est.monitors[0], @@ -327,6 +365,37 @@ def test_test(self): self.assertEquals(1, est.fit_count) self.assertEquals(1, est.eval_count) + def test_continuous_eval_evaluates_checkpoint_once(self): + # Temporarily disabled until we figure out the threading story on Jenkins. + return + # pylint: disable=unreachable + + # The TestEstimator will raise StopIteration the second time evaluate is + # called. + ex = tf.contrib.learn.Experiment( + TestEstimator(max_evals=1), + train_input_fn='train_input', + eval_input_fn='eval_input') + + # This should not happen if the logic restricting evaluation of the same + # checkpoint works. We do need some checkpoint though, otherwise Experiment + # will never evaluate. + ex.estimator.fake_checkpoint() + + # Start a separate thread with continuous eval + thread = threading.Thread( + target=lambda: ex.continuous_eval(delay_secs=0, throttle_delay_secs=0)) + thread.start() + + # The thread will die if it evaluates twice, and we should never evaluate + # twice since we don't write another checkpoint. 
Since we did not enable + # throttling, if it hasn't died after two seconds, we're good. + thread.join(2) + self.assertTrue(thread.is_alive()) + + # But we should have evaluated once. + count = ex.estimator.eval_count + self.assertEquals(1, count) if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py new file mode 100644 index 00000000000000..ea41e60f438e35 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/export_strategy.py @@ -0,0 +1,32 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Experiment class collecting information needed for a single training run.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +__all__ = ['ExportStrategy'] + + +class ExportStrategy(collections.namedtuple('ExportStrategy', + ['name', 'export_fn'])): + + def export(self, estimator, export_path): + return self.export_fn(estimator, export_path) + diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py index c3e2b56a6ef0af..32252cd8e3025a 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Tools to allow different io formats.""" from __future__ import absolute_import @@ -30,6 +29,7 @@ from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_record_features from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_examples from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_features +from tensorflow.contrib.learn.python.learn.learn_io.numpy_io import numpy_input_fn from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_data from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_labels from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_matrix diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py new file mode 100644 index 00000000000000..2427e51f1447a1 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py @@ -0,0 +1,130 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Methods to allow dict of numpy arrays.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +from tensorflow.contrib.learn.python.learn.dataframe.queues import feeding_functions + +# Key name to pack the target into dict of `features`. See +# `_get_unique_target_key` for details. +_TARGET_KEY = '__target_key__' + +def _get_unique_target_key(features): + """Returns a key not existed in the input dict `features`. + + Caller of `input_fn` usually provides `features` (dict of numpy arrays) and + `target`, but the underlying feeding module expects a single dict of numpy + arrays as input. So, the `target` needs to be packed into the `features` + temporarily and unpacked after calling the feeding function. Toward this goal, + this function returns a key not existed in the `features` to pack the + `target`. + """ + target_key = _TARGET_KEY + while target_key in features: + target_key += '_n' + return target_key + +def numpy_input_fn(x, + y=None, + batch_size=128, + num_epochs=1, + shuffle=True, + queue_capacity=1000, + num_threads=1): + """Returns input function that would feed dict of numpy arrays into the model. + + This returns a function outputting `features` and `target` based on the dict + of numpy arrays. The dict `features` has the same keys as the `x`. + + Example: + ```python + age = np.arange(4) * 1.0 + height = np.arange(32, 36) + x = {'age': age, 'height': height} + y = np.arange(-32, -28) + + with tf.Session() as session: + input_fn = numpy_io.numpy_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + ``` + + Args: + x: dict of numpy array object. + y: numpy array object. + batch_size: Integer, size of batches to return. + num_epochs: Integer, number of epochs to iterate over data. If `None` will + run forever. + shuffle: Boolean, if True shuffles the queue. Avoid shuffle at prediction + time. + queue_capacity: Integer, size of queue to accumulate. + num_threads: Integer, number of threads used for reading and enqueueing. + + Returns: + Function, that has signature of ()->(dict of `features`, `target`) + + Raises: + ValueError: if the shape of `y` mismatches the shape of values in `x` (i.e., + values in `x` have same shape). + TypeError: `x` is not a dict. + """ + + def input_fn(): + """Numpy input function.""" + if not isinstance(x, dict): + raise TypeError('x must be dict; got {}'.format(type(x).__name__)) + + unique_target_key = _get_unique_target_key(x) + if y is not None: + x[unique_target_key] = y + + if len(set(v.shape for v in x.values())) != 1: + shape_dict_of_x = {k: x[k].shape for k in x.keys()} + shape_of_y = None if y is None else y.shape + raise ValueError('Shape of x and y are mismatch, this will lead to ' + 'missing values. Please make sure each value in x have ' + 'the same shape as y.\n' + 'Shape for x: {}\n' + 'Shape for y: {}\n'.format(shape_dict_of_x, shape_of_y)) + + # Ensure the order of iteration is consistent. 
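To make the key-collision handling in `_get_unique_target_key` concrete, a tiny plain-Python illustration of the same loop; it matches the `'__target_key___n'` expectation exercised in the test file below:

```python
# The target is packed into the features dict under a key that is guaranteed
# not to clobber an existing feature: keep appending '_n' until the key is free.
features = {'__target_key__': [1.0, 2.0], 'age': [3.0, 4.0]}

target_key = '__target_key__'
while target_key in features:
  target_key += '_n'

print(target_key)  # __target_key___n
```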
+ ordered_dict_x = collections.OrderedDict( + sorted(x.items(), key=lambda t: t[0])) + + queue = feeding_functions.enqueue_data( + ordered_dict_x, + queue_capacity, + shuffle=shuffle, + num_threads=num_threads, + enqueue_size=batch_size, + num_epochs=num_epochs) + + features = (queue.dequeue_many(batch_size) if num_epochs is None + else queue.dequeue_up_to(batch_size)) + + # Remove the first `Tensor` in `features`, which is the row number. + if len(features) > 0: + features.pop(0) + + features = dict(zip(ordered_dict_x.keys(), features)) + if y is not None: + target = features.pop(unique_target_key) + return features, target + return features + + return input_fn diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py new file mode 100644 index 00000000000000..d3845792dbb2df --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py @@ -0,0 +1,100 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for numpy_io.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +from tensorflow.contrib.learn.python.learn.learn_io import numpy_io +from tensorflow.python.framework import errors + + +class NumpyIoTest(tf.test.TestCase): + + def testNumpyInputFn(self): + a = np.arange(4) * 1.0 + b = np.arange(32, 36) + x = {'a': a, 'b': b} + y = np.arange(-32, -28) + + with self.test_session() as session: + input_fn = numpy_io.numpy_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + features, target = input_fn() + + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(session, coord=coord) + + res = session.run([features, target]) + self.assertAllEqual(res[0]['a'], [0, 1]) + self.assertAllEqual(res[0]['b'], [32, 33]) + self.assertAllEqual(res[1], [-32, -31]) + + session.run([features, target]) + with self.assertRaises(errors.OutOfRangeError): + session.run([features, target]) + + coord.request_stop() + coord.join(threads) + + def testNumpyInputFnWithXAsNonDict(self): + x = np.arange(32, 36) + y = np.arange(4) + with self.test_session(): + with self.assertRaisesRegexp(TypeError, 'x must be dict'): + failing_input_fn = numpy_io.numpy_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + failing_input_fn() + + def testNumpyInputFnWithTargetKeyAlreadyInX(self): + array = np.arange(32, 36) + x = {'__target_key__': array} + y = np.arange(4) + + with self.test_session(): + input_fn = numpy_io.numpy_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + input_fn() + self.assertAllEqual(x['__target_key__'], array) + self.assertAllEqual(x['__target_key___n'], y) + + def testNumpyInputFnWithMismatchLengthOfInputs(self): + a = np.arange(4) * 1.0 + b = np.arange(32, 36) + x = {'a': a, 'b': b} + 
x_mismatch_length = {'a': np.arange(1), 'b': b} + y_longer_length = np.arange(10) + + with self.test_session(): + with self.assertRaisesRegexp(ValueError, 'Shape of x and y are mismatch'): + failing_input_fn = numpy_io.numpy_input_fn( + x, y_longer_length, batch_size=2, shuffle=False, num_epochs=1) + failing_input_fn() + + with self.assertRaisesRegexp(ValueError, 'Shape of x and y are mismatch'): + failing_input_fn = numpy_io.numpy_input_fn( + x=x_mismatch_length, + y=None, + batch_size=2, + shuffle=False, + num_epochs=1) + failing_input_fn() + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py index ee62e777cd54a5..e0ea6c6c4a6db8 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py @@ -122,7 +122,7 @@ def extract_pandas_labels(labels): return labels -def pandas_input_fn(x, y=None, batch_size=128, num_epochs=None, shuffle=True, +def pandas_input_fn(x, y=None, batch_size=128, num_epochs=1, shuffle=True, queue_capacity=1000, num_threads=1, target_column='target', index_column='index'): """Returns input function that would feed pandas DataFrame into the model. @@ -134,8 +134,8 @@ def pandas_input_fn(x, y=None, batch_size=128, num_epochs=None, shuffle=True, y: pandas `Series` object. batch_size: int, size of batches to return. num_epochs: int, number of epochs to iterate over data. If `None` will - run indefinetly. - shuffle: int, if shuffle the queue. Please make sure you don't shuffle at + run forever. + shuffle: bool, if shuffle the queue. Please make sure you don't shuffle at prediction time. queue_capacity: int, size of queue to accumulate. num_threads: int, number of threads used for reading and enqueueing. diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py index a4df7ba658c724..1c404903e53fc5 100644 --- a/tensorflow/contrib/learn/python/learn/metric_spec.py +++ b/tensorflow/contrib/learn/python/learn/metric_spec.py @@ -194,6 +194,9 @@ def _get_dict(name, dict_or_tensor, key): raise ValueError('MetricSpec with ' + name + '_key specified' ' requires ' + name + 's dict, got %s' % dict_or_tensor) + if key not in dict_or_tensor: + raise KeyError( + 'Key \'%s\' missing from %s.' % (key, dict_or_tensor.keys())) return dict_or_tensor[key] else: if isinstance(dict_or_tensor, dict): diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py index 8f89da14605455..b4a65901c12471 100644 --- a/tensorflow/contrib/learn/python/learn/models.py +++ b/tensorflow/contrib/learn/python/learn/models.py @@ -251,9 +251,9 @@ def bidirectional_rnn(cell_fw, ValueError: If inputs is None or an empty list. 
""" - if not isinstance(cell_fw, nn.rnn_cell.RNNCell): + if not isinstance(cell_fw, contrib_rnn.RNNCell): raise TypeError('cell_fw must be an instance of RNNCell') - if not isinstance(cell_bw, nn.rnn_cell.RNNCell): + if not isinstance(cell_bw, contrib_rnn.RNNCell): raise TypeError('cell_bw must be an instance of RNNCell') if not isinstance(inputs, list): raise TypeError('inputs must be a list') @@ -317,12 +317,12 @@ def rnn_estimator(x, y): """RNN estimator with target predictor function on top.""" x = input_op_fn(x) if cell_type == 'rnn': - cell_fn = nn.rnn_cell.BasicRNNCell + cell_fn = contrib_rnn.BasicRNNCell elif cell_type == 'gru': - cell_fn = nn.rnn_cell.GRUCell + cell_fn = contrib_rnn.GRUCell elif cell_type == 'lstm': cell_fn = functools.partial( - nn.rnn_cell.BasicLSTMCell, state_is_tuple=False) + contrib_rnn.BasicLSTMCell, state_is_tuple=False) else: raise ValueError('cell_type {} is not supported. '.format(cell_type)) # TODO(ipolosukhin): state_is_tuple=False is deprecated @@ -338,10 +338,10 @@ def rnn_estimator(x, y): bw_cell = contrib_rnn.AttentionCellWrapper( bw_cell, attn_length=attn_length, attn_size=attn_size, attn_vec_size=attn_vec_size, state_is_tuple=False) - rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers, + rnn_fw_cell = contrib_rnn.MultiRNNCell([fw_cell] * num_layers, state_is_tuple=False) # backward direction cell - rnn_bw_cell = nn.rnn_cell.MultiRNNCell([bw_cell] * num_layers, + rnn_bw_cell = contrib_rnn.MultiRNNCell([bw_cell] * num_layers, state_is_tuple=False) # pylint: disable=unexpected-keyword-arg, no-value-for-parameter _, encoding = bidirectional_rnn(rnn_fw_cell, @@ -357,7 +357,7 @@ def rnn_estimator(x, y): rnn_cell = contrib_rnn.AttentionCellWrapper( rnn_cell, attn_length=attn_length, attn_size=attn_size, attn_vec_size=attn_vec_size, state_is_tuple=False) - cell = nn.rnn_cell.MultiRNNCell([rnn_cell] * num_layers, + cell = contrib_rnn.MultiRNNCell([rnn_cell] * num_layers, state_is_tuple=False) _, encoding = nn.rnn(cell, x, diff --git a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops_test.py b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops_test.py index 532f825c960885..6dd332d3639087 100644 --- a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops_test.py +++ b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops_test.py @@ -73,7 +73,7 @@ def test_rnn_decoder(self): with self.test_session(): decoder_inputs = [tf.placeholder(tf.float32, [2, 2]) for _ in range(3)] encoding = tf.placeholder(tf.float32, [2, 2]) - cell = tf.nn.rnn_cell.GRUCell(2) + cell = tf.contrib.rnn.GRUCell(2) outputs, states, sampling_outputs, sampling_states = ( ops.rnn_decoder(decoder_inputs, encoding, cell)) self.assertEqual(len(outputs), 3) diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/sparsify_densify_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/sparsify_densify_test.py index 4328eed5686ccd..bce081eee6dcc7 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/sparsify_densify_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/sparsify_densify_test.py @@ -64,7 +64,7 @@ def _test_sparsify_densify(self, x, default_value): expected_x = x expected_x_values = x_values - np.testing.assert_array_equal(len(x), sparse_val.shape[0]) + np.testing.assert_array_equal(len(x), sparse_val.dense_shape[0]) np.testing.assert_array_equal(expected_x_values, sparse_val.values) np.testing.assert_array_equal(x_indexes, sparse_val.indices) np.testing.assert_array_equal(expected_x, dense_val) diff --git 
a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py index 54bb0fb3d7faba..5d7ba38446a106 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py @@ -23,6 +23,7 @@ import re import time +from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn.estimators import constants from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import gc @@ -246,3 +247,26 @@ def parser(path): for p in delete_filter(gc.get_paths(export_dir_base, parser=parser)): gfile.DeleteRecursively(p.path) + + +def make_export_strategy(export_input_fn, + default_output_alternative_key='default', + assets_extra=None, + export_as_text=False, + exports_to_keep=None): + """Create an ExportStrategy for use with Experiment.""" + + def export_fn(estimator, export_dir_base): + """Exports the given Estimator as a SavedModel.""" + export_result = estimator.export_savedmodel( + export_dir_base, + export_input_fn, + default_output_alternative_key=default_output_alternative_key, + assets_extra=assets_extra, + export_as_text=export_as_text, + exports_to_keep=exports_to_keep) + + garbage_collect_exports(export_dir_base, exports_to_keep) + return export_result + + return export_strategy.ExportStrategy('Servo', export_fn) diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD new file mode 100644 index 00000000000000..d4e8582bcc466d --- /dev/null +++ b/tensorflow/contrib/legacy_seq2seq/BUILD @@ -0,0 +1,39 @@ +# Description: +# Contains library to create sequence-to-sequence models on top of TensorFlow. +# APIs here are meant to evolve over time. + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow:tensorflow.bzl", "cuda_py_tests") + +py_library( + name = "seq2seq_py", + srcs = ["__init__.py"] + glob(["python/ops/*.py"]), + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], +) + +cuda_py_tests( + name = "seq2seq_test", + size = "medium", + srcs = ["python/kernel_tests/seq2seq_test.py"], + additional_deps = [ + "//tensorflow:tensorflow_py", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/legacy_seq2seq/__init__.py b/tensorflow/contrib/legacy_seq2seq/__init__.py new file mode 100644 index 00000000000000..1b9043645cb78b --- /dev/null +++ b/tensorflow/contrib/legacy_seq2seq/__init__.py @@ -0,0 +1,54 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Deprecated library for creating sequence-to-sequence models in TensorFlow. + +@@attention_decoder +@@basic_rnn_seq2seq +@@embedding_attention_decoder +@@embedding_attention_seq2seq +@@embedding_rnn_decoder +@@embedding_rnn_seq2seq +@@embedding_tied_rnn_seq2seq +@@model_with_buckets +@@one2many_rnn_seq2seq +@@rnn_decoder +@@sequence_loss +@@sequence_loss_by_example +@@tied_rnn_seq2seq +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops.seq2seq import attention_decoder +from tensorflow.python.ops.seq2seq import basic_rnn_seq2seq +from tensorflow.python.ops.seq2seq import embedding_attention_decoder +from tensorflow.python.ops.seq2seq import embedding_attention_seq2seq +from tensorflow.python.ops.seq2seq import embedding_rnn_decoder +from tensorflow.python.ops.seq2seq import embedding_rnn_seq2seq +from tensorflow.python.ops.seq2seq import embedding_tied_rnn_seq2seq +from tensorflow.python.ops.seq2seq import model_with_buckets +from tensorflow.python.ops.seq2seq import one2many_rnn_seq2seq +from tensorflow.python.ops.seq2seq import rnn_decoder +from tensorflow.python.ops.seq2seq import sequence_loss +from tensorflow.python.ops.seq2seq import sequence_loss_by_example +from tensorflow.python.ops.seq2seq import tied_rnn_seq2seq + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/legacy_seq2seq/python/__init__.py b/tensorflow/contrib/legacy_seq2seq/python/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py new file mode 100644 index 00000000000000..33b5b96ca86a0c --- /dev/null +++ b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py @@ -0,0 +1,775 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for functional style sequence-to-sequence models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import random + +import numpy as np +import tensorflow as tf + + +class Seq2SeqTest(tf.test.TestCase): + + def testRNNDecoder(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + _, enc_state = tf.nn.rnn( + tf.contrib.rnn.GRUCell(2), inp, dtype=tf.float32) + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + cell = tf.contrib.rnn.OutputProjectionWrapper( + tf.contrib.rnn.GRUCell(2), 4) + dec, mem = tf.contrib.legacy_seq2seq.rnn_decoder( + dec_inp, enc_state, cell) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testBasicRNNSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + cell = tf.contrib.rnn.OutputProjectionWrapper( + tf.contrib.rnn.GRUCell(2), 4) + dec, mem = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq( + inp, dec_inp, cell) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testTiedRNNSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + cell = tf.contrib.rnn.OutputProjectionWrapper( + tf.contrib.rnn.GRUCell(2), 4) + dec, mem = tf.contrib.legacy_seq2seq.tied_rnn_seq2seq( + inp, dec_inp, cell) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual(1, len(res)) + self.assertEqual((2, 2), res[0].shape) + + def testEmbeddingRNNDecoder(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + _, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + dec, mem = tf.contrib.legacy_seq2seq.embedding_rnn_decoder( + dec_inp, enc_state, cell, num_symbols=4, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 2), res[0].shape) + + res = sess.run([mem]) + self.assertEqual(1, len(res)) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + + def testEmbeddingRNNSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)] + dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + dec, mem = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, 
num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + + # Test with state_is_tuple=False. + with tf.variable_scope("no_tuple"): + cell1 = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False) + dec, mem = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell1, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 4), res[0].shape) + + # Test externally provided output projection. + w = tf.get_variable("proj_w", [2, 5]) + b = tf.get_variable("proj_b", [5]) + with tf.variable_scope("proj_seq2seq"): + dec, _ = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, output_projection=(w, b)) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 2), res[0].shape) + + # Test that previous-feeding model ignores inputs after the first. + dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in range(3)] + with tf.variable_scope("other"): + d3, _ = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp2, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, + feed_previous=tf.constant(True)) + sess.run([tf.global_variables_initializer()]) + tf.get_variable_scope().reuse_variables() + d1, _ = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, feed_previous=True) + d2, _ = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp2, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, feed_previous=True) + res1 = sess.run(d1) + res2 = sess.run(d2) + res3 = sess.run(d3) + self.assertAllClose(res1, res2) + self.assertAllClose(res1, res3) + + def testEmbeddingTiedRNNSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)] + dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + dec, mem = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + + # Test when num_decoder_symbols is provided, the size of decoder output + # is num_decoder_symbols. + with tf.variable_scope("decoder_symbols_seq2seq"): + dec, mem = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_symbols=5, num_decoder_symbols=3, + embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 3), res[0].shape) + + # Test externally provided output projection. 
+ w = tf.get_variable("proj_w", [2, 5]) + b = tf.get_variable("proj_b", [5]) + with tf.variable_scope("proj_seq2seq"): + dec, _ = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2, + output_projection=(w, b)) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 2), res[0].shape) + + # Test that previous-feeding model ignores inputs after the first. + dec_inp2 = [tf.constant(0, tf.int32, shape=[2])] * 3 + with tf.variable_scope("other"): + d3, _ = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp2, cell, num_symbols=5, embedding_size=2, + feed_previous=tf.constant(True)) + sess.run([tf.global_variables_initializer()]) + tf.get_variable_scope().reuse_variables() + d1, _ = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2, + feed_previous=True) + d2, _ = tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp2, cell, num_symbols=5, embedding_size=2, + feed_previous=True) + res1 = sess.run(d1) + res2 = sess.run(d2) + res3 = sess.run(d3) + self.assertAllClose(res1, res2) + self.assertAllClose(res1, res3) + + def testAttentionDecoder1(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.GRUCell(2) + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testAttentionDecoder2(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.GRUCell(2) + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4, + num_heads=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testDynamicAttentionDecoder1(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.GRUCell(2) + inp = tf.constant(0.5, shape=[2, 2, 2]) + enc_outputs, enc_state = tf.nn.dynamic_rnn(cell, inp, dtype=tf.float32) + attn_states = enc_outputs + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testDynamicAttentionDecoder2(self): + 
with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.GRUCell(2) + inp = tf.constant(0.5, shape=[2, 2, 2]) + enc_outputs, enc_state = tf.nn.dynamic_rnn(cell, inp, dtype=tf.float32) + attn_states = enc_outputs + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4, + num_heads=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testAttentionDecoderStateIsTuple(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + cell = tf.contrib.rnn.MultiRNNCell(cells=[cell] * 2, + state_is_tuple=True) + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual(2, len(res[0])) + self.assertEqual((2, 2), res[0][0].c.shape) + self.assertEqual((2, 2), res[0][0].h.shape) + self.assertEqual((2, 2), res[0][1].c.shape) + self.assertEqual((2, 2), res[0][1].h.shape) + + def testDynamicAttentionDecoderStateIsTuple(self): + with self.test_session() as sess: + with tf.variable_scope("root", + initializer=tf.constant_initializer(0.5)): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + cell = tf.contrib.rnn.MultiRNNCell(cells=[cell] * 2, + state_is_tuple=True) + inp = tf.constant(0.5, shape=[2, 2, 2]) + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.contrib.legacy_seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual(2, len(res[0])) + self.assertEqual((2, 2), res[0][0].c.shape) + self.assertEqual((2, 2), res[0][0].h.shape) + self.assertEqual((2, 2), res[0][1].c.shape) + self.assertEqual((2, 2), res[0][1].h.shape) + + def testEmbeddingAttentionDecoder(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + cell = tf.contrib.rnn.GRUCell(2) + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + dec, mem = tf.contrib.legacy_seq2seq.embedding_attention_decoder( + dec_inp, enc_state, attn_states, cell, num_symbols=4, + embedding_size=2, output_size=3) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + 
self.assertEqual((2, 3), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].shape) + + def testEmbeddingAttentionSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)] + dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + dec, mem = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + + # Test with state_is_tuple=False. + with tf.variable_scope("no_tuple"): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False) + dec, mem = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + + res = sess.run([mem]) + self.assertEqual((2, 4), res[0].shape) + + # Test externally provided output projection. + w = tf.get_variable("proj_w", [2, 5]) + b = tf.get_variable("proj_b", [5]) + with tf.variable_scope("proj_seq2seq"): + dec, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, output_projection=(w, b)) + sess.run([tf.global_variables_initializer()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 2), res[0].shape) + + # Test that previous-feeding model ignores inputs after the first. 
+ dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in range(3)] + with tf.variable_scope("other"): + d3, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp2, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, + feed_previous=tf.constant(True)) + sess.run([tf.global_variables_initializer()]) + tf.get_variable_scope().reuse_variables() + d1, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, feed_previous=True) + d2, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp2, cell, num_encoder_symbols=2, + num_decoder_symbols=5, embedding_size=2, feed_previous=True) + res1 = sess.run(d1) + res2 = sess.run(d2) + res3 = sess.run(d3) + self.assertAllClose(res1, res2) + self.assertAllClose(res1, res3) + + def testOne2ManyRNNSeq2Seq(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)] + dec_inp_dict = {} + dec_inp_dict["0"] = [ + tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + dec_inp_dict["1"] = [ + tf.constant(i, tf.int32, shape=[2]) for i in range(4)] + dec_symbols_dict = {"0": 5, "1": 6} + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + outputs_dict, state_dict = ( + tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq( + enc_inp, dec_inp_dict, cell, 2, dec_symbols_dict, + embedding_size=2)) + + sess.run([tf.global_variables_initializer()]) + res = sess.run(outputs_dict["0"]) + self.assertEqual(3, len(res)) + self.assertEqual((2, 5), res[0].shape) + res = sess.run(outputs_dict["1"]) + self.assertEqual(4, len(res)) + self.assertEqual((2, 6), res[0].shape) + res = sess.run([state_dict["0"]]) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + res = sess.run([state_dict["1"]]) + self.assertEqual((2, 2), res[0].c.shape) + self.assertEqual((2, 2), res[0].h.shape) + + # Test that previous-feeding model ignores inputs after the first, i.e. + # dec_inp_dict2 has different inputs from dec_inp_dict after the first + # time-step. 
+ dec_inp_dict2 = {} + dec_inp_dict2["0"] = [ + tf.constant(0, tf.int32, shape=[2]) for _ in range(3)] + dec_inp_dict2["1"] = [ + tf.constant(0, tf.int32, shape=[2]) for _ in range(4)] + with tf.variable_scope("other"): + outputs_dict3, _ = tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq( + enc_inp, dec_inp_dict2, cell, 2, dec_symbols_dict, + embedding_size=2, feed_previous=tf.constant(True)) + sess.run([tf.global_variables_initializer()]) + tf.get_variable_scope().reuse_variables() + outputs_dict1, _ = tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq( + enc_inp, dec_inp_dict, cell, 2, dec_symbols_dict, + embedding_size=2, feed_previous=True) + outputs_dict2, _ = tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq( + enc_inp, dec_inp_dict2, cell, 2, dec_symbols_dict, + embedding_size=2, feed_previous=True) + res1 = sess.run(outputs_dict1["0"]) + res2 = sess.run(outputs_dict2["0"]) + res3 = sess.run(outputs_dict3["0"]) + self.assertAllClose(res1, res2) + self.assertAllClose(res1, res3) + + def testSequenceLoss(self): + with self.test_session() as sess: + logits = [tf.constant(i + 0.5, shape=[2, 5]) for i in range(3)] + targets = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + weights = [tf.constant(1.0, shape=[2]) for i in range(3)] + + average_loss_per_example = tf.contrib.legacy_seq2seq.sequence_loss( + logits, targets, weights, + average_across_timesteps=True, + average_across_batch=True) + res = sess.run(average_loss_per_example) + self.assertAllClose(1.60944, res) + + average_loss_per_sequence = tf.contrib.legacy_seq2seq.sequence_loss( + logits, targets, weights, + average_across_timesteps=False, + average_across_batch=True) + res = sess.run(average_loss_per_sequence) + self.assertAllClose(4.828314, res) + + total_loss = tf.contrib.legacy_seq2seq.sequence_loss( + logits, targets, weights, + average_across_timesteps=False, + average_across_batch=False) + res = sess.run(total_loss) + self.assertAllClose(9.656628, res) + + def testSequenceLossByExample(self): + with self.test_session() as sess: + output_classes = 5 + logits = [tf.constant(i + 0.5, shape=[2, output_classes]) + for i in range(3)] + targets = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)] + weights = [tf.constant(1.0, shape=[2]) for i in range(3)] + + average_loss_per_example = ( + tf.contrib.legacy_seq2seq.sequence_loss_by_example( + logits, targets, weights, + average_across_timesteps=True)) + res = sess.run(average_loss_per_example) + self.assertAllClose(np.asarray([1.609438, 1.609438]), res) + + loss_per_sequence = tf.contrib.legacy_seq2seq.sequence_loss_by_example( + logits, targets, weights, + average_across_timesteps=False) + res = sess.run(loss_per_sequence) + self.assertAllClose(np.asarray([4.828314, 4.828314]), res) + + def testModelWithBucketsScopeAndLoss(self): + """Test that variable scope reuse is not reset after model_with_buckets.""" + classes = 10 + buckets = [(4, 4), (8, 8)] + + with self.test_session(): + # Here comes a sample Seq2Seq model using GRU cells. 
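The expected values in `testSequenceLoss` above follow directly from the construction: each logit vector is constant across its 5 classes, so the softmax is uniform and every timestep of every example contributes a cross-entropy of ln 5. A quick arithmetic check in plain Python:

```python
import math

ln5 = math.log(5.0)  # per-step loss for a uniform softmax over 5 classes
print(ln5)           # ~1.609438: averaged across timesteps and across the batch
print(3 * ln5)       # ~4.828314: summed over 3 timesteps, averaged over the batch
print(2 * 3 * ln5)   # ~9.656627: also summed over the batch of 2 (the test's
                     #            9.656628 agrees within the assertAllClose tolerance)
```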
+ def SampleGRUSeq2Seq(enc_inp, dec_inp, weights, per_example_loss): + """Example sequence-to-sequence model that uses GRU cells.""" + def GRUSeq2Seq(enc_inp, dec_inp): + cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(24)] * 2, + state_is_tuple=True) + return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=classes, + num_decoder_symbols=classes, embedding_size=24) + targets = [dec_inp[i+1] for i in range(len(dec_inp) - 1)] + [0] + return tf.contrib.legacy_seq2seq.model_with_buckets( + enc_inp, dec_inp, targets, weights, buckets, GRUSeq2Seq, + per_example_loss=per_example_loss) + + # Now we construct the copy model. + inp = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)] + out = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)] + weights = [tf.ones_like(inp[0], dtype=tf.float32) for _ in range(8)] + with tf.variable_scope("root"): + _, losses1 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=False) + # Now check that we did not accidentally set reuse. + self.assertEqual(False, tf.get_variable_scope().reuse) + # Construct one more model with per-example loss. + tf.get_variable_scope().reuse_variables() + _, losses2 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=True) + # First loss is scalar, the second one is a 1-dimensinal tensor. + self.assertEqual([], losses1[0].get_shape().as_list()) + self.assertEqual([None], losses2[0].get_shape().as_list()) + + def testModelWithBuckets(self): + """Larger tests that does full sequence-to-sequence model training.""" + # We learn to copy 10 symbols in 2 buckets: length 4 and length 8. + classes = 10 + buckets = [(4, 4), (8, 8)] + perplexities = [[], []] # Results for each bucket. + tf.set_random_seed(111) + random.seed(111) + np.random.seed(111) + + with self.test_session() as sess: + # We use sampled softmax so we keep output projection separate. + w = tf.get_variable("proj_w", [24, classes]) + w_t = tf.transpose(w) + b = tf.get_variable("proj_b", [classes]) + # Here comes a sample Seq2Seq model using GRU cells. + def SampleGRUSeq2Seq(enc_inp, dec_inp, weights): + """Example sequence-to-sequence model that uses GRU cells.""" + def GRUSeq2Seq(enc_inp, dec_inp): + cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(24)] * 2, + state_is_tuple=True) + return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols=classes, + num_decoder_symbols=classes, embedding_size=24, + output_projection=(w, b)) + targets = [dec_inp[i+1] for i in range(len(dec_inp) - 1)] + [0] + def SampledLoss(labels, inputs): + labels = tf.reshape(labels, [-1, 1]) + return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes) + return tf.contrib.legacy_seq2seq.model_with_buckets( + enc_inp, dec_inp, targets, weights, buckets, GRUSeq2Seq, + softmax_loss_function=SampledLoss) + + # Now we construct the copy model. 
+ batch_size = 8 + inp = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)] + out = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)] + weights = [tf.ones_like(inp[0], dtype=tf.float32) for _ in range(8)] + with tf.variable_scope("root"): + _, losses = SampleGRUSeq2Seq(inp, out, weights) + updates = [] + params = tf.all_variables() + optimizer = tf.train.AdamOptimizer(0.03, epsilon=1e-5) + for i in range(len(buckets)): + full_grads = tf.gradients(losses[i], params) + grads, _ = tf.clip_by_global_norm(full_grads, 30.0) + update = optimizer.apply_gradients(zip(grads, params)) + updates.append(update) + sess.run([tf.global_variables_initializer()]) + steps = 6 + for _ in range(steps): + bucket = random.choice(np.arange(len(buckets))) + length = buckets[bucket][0] + i = [np.array([np.random.randint(9) + 1 for _ in range(batch_size)], + dtype=np.int32) for _ in range(length)] + # 0 is our "GO" symbol here. + o = [np.array([0] * batch_size, dtype=np.int32)] + i + feed = {} + for i1, i2, o1, o2 in zip(inp[:length], i[:length], + out[:length], o[:length]): + feed[i1.name] = i2 + feed[o1.name] = o2 + if length < 8: # For the 4-bucket, we need the 5th as target. + feed[out[length].name] = o[length] + res = sess.run([updates[bucket], losses[bucket]], feed) + perplexities[bucket].append(math.exp(float(res[1]))) + for bucket in range(len(buckets)): + if len(perplexities[bucket]) > 1: # Assert that perplexity went down. + self.assertLess(perplexities[bucket][-1], perplexities[bucket][0]) + + def testModelWithBooleanFeedPrevious(self): + """Test the model behavior when feed_previous is True. + + For example, the following two cases have the same effect: + - Train `embedding_rnn_seq2seq` with `feed_previous=True`, which contains + a `embedding_rnn_decoder` with `feed_previous=True` and + `update_embedding_for_previous=True`. The decoder is fed with "" + and outputs "A, B, C". + - Train `embedding_rnn_seq2seq` with `feed_previous=False`. The decoder + is fed with ", A, B". 
+ """ + num_encoder_symbols = 3 + num_decoder_symbols = 5 + batch_size = 2 + num_enc_timesteps = 2 + num_dec_timesteps = 3 + + def TestModel(seq2seq): + with self.test_session(graph=tf.Graph()) as sess: + tf.set_random_seed(111) + random.seed(111) + np.random.seed(111) + + enc_inp = [tf.constant(i + 1, tf.int32, shape=[batch_size]) + for i in range(num_enc_timesteps)] + dec_inp_fp_true = [tf.constant(i, tf.int32, shape=[batch_size]) + for i in range(num_dec_timesteps)] + dec_inp_holder_fp_false = [tf.placeholder(tf.int32, shape=[batch_size]) + for _ in range(num_dec_timesteps)] + targets = [tf.constant(i + 1, tf.int32, shape=[batch_size]) + for i in range(num_dec_timesteps)] + weights = [tf.constant(1.0, shape=[batch_size]) + for i in range(num_dec_timesteps)] + + def ForwardBackward(enc_inp, dec_inp, feed_previous): + scope_name = "fp_{}".format(feed_previous) + with tf.variable_scope(scope_name): + dec_op, _ = seq2seq(enc_inp, dec_inp, feed_previous=feed_previous) + net_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, + scope_name) + optimizer = tf.train.AdamOptimizer(0.03, epsilon=1e-5) + update_op = optimizer.minimize( + tf.contrib.legacy_seq2seq.sequence_loss(dec_op, targets, weights), + var_list=net_variables) + return dec_op, update_op, net_variables + + dec_op_fp_true, update_fp_true, variables_fp_true = ForwardBackward( + enc_inp, dec_inp_fp_true, feed_previous=True) + dec_op_fp_false, update_fp_false, variables_fp_false = ForwardBackward( + enc_inp, dec_inp_holder_fp_false, feed_previous=False) + + sess.run(tf.global_variables_initializer()) + + # We only check consistencies between the variables existing in both + # the models with True and False feed_previous. Variables created by + # the loop_function in the model with True feed_previous are ignored. + v_false_name_dict = {v.name.split('/', 1)[-1]: v + for v in variables_fp_false} + matched_variables = [(v, v_false_name_dict[v.name.split('/', 1)[-1]]) + for v in variables_fp_true] + for v_true, v_false in matched_variables: + sess.run(tf.assign(v_false, v_true)) + + # Take the symbols generated by the decoder with feed_previous=True as + # the true input symbols for the decoder with feed_previous=False. 
+ dec_fp_true = sess.run(dec_op_fp_true) + output_symbols_fp_true = np.argmax(dec_fp_true, axis=2) + dec_inp_fp_false = np.vstack((dec_inp_fp_true[0].eval(), + output_symbols_fp_true[:-1])) + sess.run(update_fp_true) + sess.run(update_fp_false, + {holder: inp for holder, inp in zip(dec_inp_holder_fp_false, + dec_inp_fp_false)}) + + for v_true, v_false in matched_variables: + self.assertAllClose(v_true.eval(), v_false.eval()) + + def EmbeddingRNNSeq2SeqF(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + return tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols, + num_decoder_symbols, embedding_size=2, feed_previous=feed_previous) + + def EmbeddingRNNSeq2SeqNoTupleF(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False) + return tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols, + num_decoder_symbols, embedding_size=2, feed_previous=feed_previous) + + def EmbeddingTiedRNNSeq2Seq(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + return tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_decoder_symbols, embedding_size=2, + feed_previous=feed_previous) + + def EmbeddingTiedRNNSeq2SeqNoTuple(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False) + return tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, num_decoder_symbols, embedding_size=2, + feed_previous=feed_previous) + + def EmbeddingAttentionSeq2Seq(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=True) + return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols, + num_decoder_symbols, embedding_size=2, feed_previous=feed_previous) + + def EmbeddingAttentionSeq2SeqNoTuple(enc_inp, dec_inp, feed_previous): + cell = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False) + return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, num_encoder_symbols, + num_decoder_symbols, embedding_size=2, feed_previous=feed_previous) + + for model in (EmbeddingRNNSeq2SeqF, EmbeddingRNNSeq2SeqNoTupleF, + EmbeddingTiedRNNSeq2Seq, EmbeddingTiedRNNSeq2SeqNoTuple, + EmbeddingAttentionSeq2Seq, EmbeddingAttentionSeq2SeqNoTuple): + TestModel(model) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD index e3ed248dd599b2..20f4970f0968a2 100644 --- a/tensorflow/contrib/linalg/BUILD +++ b/tensorflow/contrib/linalg/BUILD @@ -35,6 +35,32 @@ cuda_py_tests( shard_count = 5, ) +cuda_py_tests( + name = "linear_operator_tril_test", + size = "medium", + srcs = ["python/kernel_tests/linear_operator_tril_test.py"], + additional_deps = [ + ":linalg_py", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + shard_count = 5, +) + +cuda_py_tests( + name = "linear_operator_util_test", + size = "small", + srcs = ["python/kernel_tests/linear_operator_util_test.py"], + additional_deps = [ + ":linalg_py", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + shard_count = 5, +) + py_library( name = "linalg_py", srcs = ["__init__.py"] + glob(["python/ops/*.py"]), diff --git a/tensorflow/contrib/linalg/__init__.py 
b/tensorflow/contrib/linalg/__init__.py index 3f73581bc3efdf..d15ed052f26f1c 100644 --- a/tensorflow/contrib/linalg/__init__.py +++ b/tensorflow/contrib/linalg/__init__.py @@ -30,6 +30,7 @@ ### Individual operators @@LinearOperatorDiag +@@LinearOperatorTriL """ from __future__ import absolute_import @@ -40,5 +41,6 @@ from tensorflow.contrib.linalg.python.ops.linear_operator import * from tensorflow.contrib.linalg.python.ops.linear_operator_diag import * +from tensorflow.contrib.linalg.python.ops.linear_operator_tril import * # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py index d03fb1d66f32c5..09e7f880e09417 100644 --- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py @@ -38,6 +38,7 @@ def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder): if dtype.is_complex: diag = tf.complex( diag, tf.random_normal(diag_shape, dtype=dtype.real_dtype)) + diag_ph = tf.placeholder(dtype=dtype) if use_placeholder: @@ -45,14 +46,14 @@ def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder): # diag is random and we want the same value used for both mat and # feed_dict. diag = diag.eval() - mat = tf.matrix_diag(diag) operator = linalg.LinearOperatorDiag(diag_ph) feed_dict = {diag_ph: diag} else: - mat = tf.matrix_diag(diag) operator = linalg.LinearOperatorDiag(diag) feed_dict = None + mat = tf.matrix_diag(diag) + return operator, mat, feed_dict def test_assert_positive_definite_raises_for_zero_eigenvalue(self): @@ -60,6 +61,9 @@ def test_assert_positive_definite_raises_for_zero_eigenvalue(self): with self.test_session(): diag = [1.0, 0.0] operator = linalg.LinearOperatorDiag(diag) + + # is_self_adjoint should be auto-set for real diag. + self.assertTrue(operator.is_self_adjoint) with self.assertRaisesOpError("non-positive.*not positive definite"): operator.assert_positive_definite().run() @@ -69,6 +73,9 @@ def test_assert_positive_definite_raises_for_negative_real_eigvalues(self): diag_y = [0., 0.] # Imaginary eigenvalues should not matter. diag = tf.complex(diag_x, diag_y) operator = linalg.LinearOperatorDiag(diag) + + # is_self_adjoint should not be auto-set for complex diag. + self.assertTrue(operator.is_self_adjoint is None) with self.assertRaisesOpError("non-positive real.*not positive definite"): operator.assert_positive_definite().run() @@ -84,7 +91,7 @@ def test_assert_non_singular_raises_if_zero_eigenvalue(self): # Singlular matrix with one positive eigenvalue and one zero eigenvalue. with self.test_session(): diag = [1.0, 0.0] - operator = linalg.LinearOperatorDiag(diag) + operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True) with self.assertRaisesOpError("Singular operator"): operator.assert_non_singular().run() @@ -124,7 +131,7 @@ def test_broadcast_apply_and_solve(self): # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve # and apply with 'x' as the argument. diag = tf.random_uniform(shape=(2, 1, 3)) - operator = linalg.LinearOperatorDiag(diag) + operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True) self.assertAllEqual((2, 1, 3, 3), operator.shape) # Create a batch matrix with the broadcast shape of operator. 
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_tril_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_tril_test.py new file mode 100644 index 00000000000000..35f1c4a48caadd --- /dev/null +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_tril_test.py @@ -0,0 +1,104 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.contrib.linalg.python.ops import linear_operator_test_util + + +linalg = tf.contrib.linalg +tf.set_random_seed(23) + + +class LinearOperatorTriLTest( + linear_operator_test_util.SquareLinearOperatorDerivedClassTest): + """Most tests done in the base class LinearOperatorDerivedClassTest.""" + + @property + def _dtypes_to_test(self): + # TODO(langmore) Test complex types once supported by + # matrix_triangular_solve. + return [tf.float32, tf.float64] + + def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder): + shape = list(shape) + diag_shape = shape[:-1] + + # Upper triangle will be ignored. + # Use a diagonal that ensures this matrix is well conditioned. + tril = tf.random_normal(shape=shape, dtype=dtype.real_dtype) + diag = tf.random_uniform( + shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.) + if dtype.is_complex: + tril = tf.complex( + tril, tf.random_normal(shape, dtype=dtype.real_dtype)) + diag = tf.complex( + diag, tf.random_uniform( + shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.)) + + tril = tf.matrix_set_diag(tril, diag) + + tril_ph = tf.placeholder(dtype=dtype) + + if use_placeholder: + # Evaluate the tril here because (i) you cannot feed a tensor, and (ii) + # tril is random and we want the same value used for both mat and + # feed_dict. + tril = tril.eval() + operator = linalg.LinearOperatorTriL(tril_ph) + feed_dict = {tril_ph: tril} + else: + operator = linalg.LinearOperatorTriL(tril) + feed_dict = None + + mat = tf.matrix_band_part(tril, -1, 0) + + return operator, mat, feed_dict + + def test_assert_positive_definite(self): + # Matrix with one positive eigenvalue and one negative eigenvalue. + with self.test_session(): + tril = [[1., 0.], [1., -1.]] + operator = linalg.LinearOperatorTriL(tril) + with self.assertRaisesOpError("was not positive definite"): + operator.assert_positive_definite().run() + + def test_assert_non_singular(self): + # Singular matrix with one positive eigenvalue and one zero eigenvalue. + with self.test_session(): + tril = [[1., 0.], [1., 0.]] + operator = linalg.LinearOperatorTriL(tril) + with self.assertRaisesOpError("Singular operator"): + operator.assert_non_singular().run() + + def test_is_x_flags(self): + # Matrix with two positive eigenvalues.
+ tril = [[1., 0.], [1., 1.]] + operator = linalg.LinearOperatorTriL( + tril, + is_positive_definite=True, + is_non_singular=True, + is_self_adjoint=False) + self.assertTrue(operator.is_positive_definite) + self.assertTrue(operator.is_non_singular) + self.assertFalse(operator.is_self_adjoint) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py new file mode 100644 index 00000000000000..8e439070ccef79 --- /dev/null +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py @@ -0,0 +1,90 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.contrib.linalg.python.ops import linear_operator_util + + +linalg = tf.contrib.linalg +tf.set_random_seed(23) + + +class AssertZeroImagPartTest(tf.test.TestCase): + + def test_real_tensor_doesnt_raise(self): + x = tf.convert_to_tensor([0., 2, 3]) + with self.test_session(): + # Should not raise. + linear_operator_util.assert_zero_imag_part(x, message="ABC123").run() + + def test_complex_tensor_with_imag_zero_doesnt_raise(self): + x = tf.convert_to_tensor([1., 0, 3]) + y = tf.convert_to_tensor([0., 0, 0]) + z = tf.complex(x, y) + with self.test_session(): + # Should not raise. + linear_operator_util.assert_zero_imag_part(z, message="ABC123").run() + + def test_complex_tensor_with_nonzero_imag_raises(self): + x = tf.convert_to_tensor([1., 2, 0]) + y = tf.convert_to_tensor([1., 2, 0]) + z = tf.complex(x, y) + with self.test_session(): + with self.assertRaisesOpError("ABC123"): + linear_operator_util.assert_zero_imag_part(z, message="ABC123").run() + + +class AssertNoEntriesWithModulusZeroTest(tf.test.TestCase): + + def test_nonzero_real_tensor_doesnt_raise(self): + x = tf.convert_to_tensor([1., 2, 3]) + with self.test_session(): + # Should not raise. + linear_operator_util.assert_no_entries_with_modulus_zero( + x, message="ABC123").run() + + def test_nonzero_complex_tensor_doesnt_raise(self): + x = tf.convert_to_tensor([1., 0, 3]) + y = tf.convert_to_tensor([1., 2, 0]) + z = tf.complex(x, y) + with self.test_session(): + # Should not raise. 
+ linear_operator_util.assert_no_entries_with_modulus_zero( + z, message="ABC123").run() + + def test_zero_real_tensor_raises(self): + x = tf.convert_to_tensor([1., 0, 3]) + with self.test_session(): + with self.assertRaisesOpError("ABC123"): + linear_operator_util.assert_no_entries_with_modulus_zero( + x, message="ABC123").run() + + def test_zero_complex_tensor_raises(self): + x = tf.convert_to_tensor([1., 2, 0]) + y = tf.convert_to_tensor([1., 2, 0]) + z = tf.complex(x, y) + with self.test_session(): + with self.assertRaisesOpError("ABC123"): + linear_operator_util.assert_no_entries_with_modulus_zero( + z, message="ABC123").run() + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py index f65ed9a6c862d2..58a891710c1fbe 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py @@ -19,11 +19,11 @@ from __future__ import print_function from tensorflow.contrib.linalg.python.ops import linear_operator +from tensorflow.contrib.linalg.python.ops import linear_operator_util from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops __all__ = ["LinearOperatorDiag",] @@ -111,7 +111,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator): def __init__(self, diag, is_non_singular=None, - is_self_adjoint=True, + is_self_adjoint=None, is_positive_definite=None, name="LinearOperatorDiag"): """Initialize a `LinearOperatorDiag`. @@ -119,11 +119,10 @@ def __init__(self, Args: diag: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`. The diagonal of the operator. Allowed dtypes: `float32`, `float64`, - `complex64`, `complex128`. + `complex64`, `complex128`. is_non_singular: Expect that this operator is non-singular. is_self_adjoint: Expect that this operator is equal to its hermitian - transpose. Since this is a real (not complex) diagonal operator, it is - always self adjoint. + transpose. If `diag.dtype` is real, this is auto-set to `True`. is_positive_definite: Expect that this operator is positive definite, meaning the real part of all eigenvalues is positive. We do not require the operator to be self-adjoint to be positive-definite. See: @@ -133,7 +132,7 @@ def __init__(self, Raises: TypeError: If `diag.dtype` is not an allowed type. - ValueError: If `is_self_adjoint` is not `True`. + ValueError: If `diag.dtype` is real, and `is_self_adjoint` is not `True`. """ allowed_dtypes = [ @@ -146,8 +145,13 @@ def __init__(self, raise TypeError( "Argument diag must have dtype in %s. Found: %s" % (allowed_dtypes, dtype)) - if dtype.is_floating and not is_self_adjoint: - raise ValueError("A real diagonal operator is always self adjoint.") + + # Check and auto-set hints. 
+ if not dtype.is_complex: + if is_self_adjoint is False: + raise ValueError("A real diagonal operator is always self adjoint.") + else: + is_self_adjoint = True super(LinearOperatorDiag, self).__init__( dtype=dtype, @@ -168,17 +172,9 @@ def _shape_dynamic(self): return array_ops.concat(0, (d_shape, [k])) def _assert_non_singular(self): - if self.dtype.is_complex: - should_be_nonzero = math_ops.complex_abs(self._diag) - else: - should_be_nonzero = self._diag - - nonzero_diag = math_ops.reduce_all( - math_ops.logical_not(math_ops.equal(should_be_nonzero, 0))) - - return control_flow_ops.Assert( - nonzero_diag, - data=["Singular operator: diag contained zero values.", self._diag]) + return linear_operator_util.assert_no_entries_with_modulus_zero( + self._diag, + message="Singular operator: Diagonal contained zero values.") def _assert_positive_definite(self): if self.dtype.is_complex: @@ -195,7 +191,7 @@ def _assert_positive_definite(self): message=message) def _assert_self_adjoint(self): - return _assert_imag_part_zero( + return linear_operator_util.assert_zero_imag_part( self._diag, message=( "This diagonal operator contained non-zero imaginary values. " @@ -225,18 +221,3 @@ def _add_to_tensor(self, x): x_diag = array_ops.matrix_diag_part(x) new_diag = self._diag + x_diag return array_ops.matrix_set_diag(x, new_diag) - - -def _assert_imag_part_zero(x, message=None): - """Assert that floating or complex 'x' is real.""" - dtype = x.dtype.base_dtype - if dtype.is_floating: - return control_flow_ops.no_op() - - if not dtype.is_complex: - raise TypeError( - "imag_part_zero only handles float or complex types. Found: %s" - % dtype) - - zero = ops.convert_to_tensor(0, dtype=dtype.real_dtype) - return check_ops.assert_equal(zero, math_ops.imag(x), message=message) diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py index 20136bfbd005ca..5f0f1e9bb137f2 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py @@ -48,7 +48,7 @@ def assertAC(self, x, y): @property def _dtypes_to_test(self): - # TODO(langmore) Test tf.float16 once tf.matrix_diag works in 16bit. + # TODO(langmore) Test tf.float16 once tf.matrix_solve works in 16bit. return [tf.float32, tf.float64, tf.complex64, tf.complex128] @abc.abstractproperty diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py new file mode 100644 index 00000000000000..ce54fa3c20166f --- /dev/null +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py @@ -0,0 +1,207 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""`LinearOperator` acting like a lower triangular matrix.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.linalg.python.ops import linear_operator +from tensorflow.contrib.linalg.python.ops import linear_operator_util +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops + +__all__ = ["LinearOperatorTriL",] + + +class LinearOperatorTriL(linear_operator.LinearOperator): + """`LinearOperator` acting like a [batch] square lower triangular matrix. + + This operator acts like a [batch] matrix `A` with shape + `[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a + batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, :, :]` is + an `N x N` matrix. + + `LinearOperatorTriL` is initialized with a `Tensor` having dimensions + `[B1,...,Bb, N, N]`. The upper triangle of the last two dimensions is ignored. + + ```python + # Create a 2 x 2 lower-triangular linear operator. + tril = [[1., 2.], [3., 4.]] + operator = LinearOperatorTriL(tril) + + # The upper triangle is ignored. + operator.to_dense() + ==> [[1., 0.] + [3., 4.]] + + operator.shape + ==> [2, 2] + + operator.log_determinant() + ==> scalar Tensor + + x = ... Shape [2, 4] Tensor + operator.apply(x) + ==> Shape [2, 4] Tensor + + # Create a [2, 3] batch of 4 x 4 linear operators. + tril = tf.random_normal(shape=[2, 3, 4, 4]) + operator = LinearOperatorTriL(tril) + + # Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible + # since the batch dimensions, [2, 1], are broadcast to + # operator.batch_shape = [2, 3]. + y = tf.random_normal(shape=[2, 1, 4, 2]) + x = operator.solve(y) + ==> operator.apply(x) = y + ``` + + ### Shape compatibility + + This operator acts on [batch] matrices with compatible shape. + `x` is a batch matrix with compatible shape for `apply` and `solve` if + + ``` + operator.shape = [B1,...,Bb] + [N, N], with b >= 0 + x.shape = [B1,...,Bb] + [N, R], with R >= 0. + ``` + + ### Performance + + Suppose `operator` is a `LinearOperatorTriL` of shape `[N, N]`, + and `x.shape = [N, R]`. Then + + * `operator.apply(x)` involves `N^2 * R` multiplications. + * `operator.solve(x)` involves `N * R` size `N` back-substitutions. + * `operator.determinant()` involves a size `N` `reduce_prod`. + + If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and + `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`. + + ### Matrix property hints + + This `LinearOperator` is initialized with boolean flags of the form `is_X`, + for `X = non_singular, self_adjoint` etc... + These have the following meaning: + * If `is_X == True`, callers should expect the operator to have the + property `X`. This is a promise that should be fulfilled, but is *not* a + runtime assert. For example, finite floating point precision may result + in these promises being violated. + * If `is_X == False`, callers should expect the operator to not have `X`. + * If `is_X == None` (the default), callers should have no expectation either + way. + """ + + def __init__(self, + tril, + is_non_singular=None, + is_self_adjoint=None, + is_positive_definite=None, + name="LinearOperatorTriL"): + """Initialize a `LinearOperatorTriL`.
+ + Args: + tril: Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`. + The lower triangular part of `tril` defines this operator. The strictly + upper triangle is ignored. Allowed dtypes: `float32`, `float64`. + is_non_singular: Expect that this operator is non-singular. + This operator is non-singular if and only if its diagonal elements are + all non-zero. + is_self_adjoint: Expect that this operator is equal to its hermitian + transpose. This operator is self-adjoint only if it is diagonal with + real-valued diagonal entries. In this case it is advised to use + `LinearOperatorDiag`. + is_positive_definite: Expect that this operator is positive definite, + meaning the real part of all eigenvalues is positive. We do not require + the operator to be self-adjoint to be positive-definite. See: + https://en.wikipedia.org/wiki/Positive-definite_matrix + #Extension_for_non_symmetric_matrices + name: A name for this `LinearOperator`. + + Raises: + TypeError: If `tril.dtype` is not an allowed type. + """ + + # TODO(langmore) Add complex types once matrix_triangular_solve works for + # them. + allowed_dtypes = [dtypes.float32, dtypes.float64] + + with ops.name_scope(name, values=[tril]): + self._tril = array_ops.matrix_band_part(tril, -1, 0) + self._diag = array_ops.matrix_diag_part(self._tril) + + dtype = self._tril.dtype + if dtype not in allowed_dtypes: + raise TypeError( + "Argument tril must have dtype in %s. Found: %s" + % (allowed_dtypes, dtype)) + + super(LinearOperatorTriL, self).__init__( + dtype=self._tril.dtype, + graph_parents=[self._tril], + is_non_singular=is_non_singular, + is_self_adjoint=is_self_adjoint, + is_positive_definite=is_positive_definite, + name=name) + + def _shape(self): + return self._tril.get_shape() + + def _shape_dynamic(self): + return array_ops.shape(self._tril) + + def _assert_non_singular(self): + return linear_operator_util.assert_no_entries_with_modulus_zero( + self._diag, + message="Singular operator: Diagonal contained zero values.") + + def _assert_positive_definite(self): + if self.dtype.is_complex: + message = ( + "Diagonal operator had diagonal entries with non-positive real part, " + "thus was not positive definite.") + else: + message = ( + "Real diagonal operator had non-positive diagonal entries, " + "thus was not positive definite.") + + return check_ops.assert_positive( + math_ops.real(self._diag), + message=message) + + def _apply(self, x, adjoint=False): + return math_ops.matmul(self._tril, x, adjoint_a=adjoint) + + def _determinant(self): + return math_ops.reduce_prod(self._diag, reduction_indices=[-1]) + + def _log_abs_determinant(self): + return math_ops.reduce_sum( + math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1]) + + def _solve(self, rhs, adjoint=False): + return linalg_ops.matrix_triangular_solve( + self._tril, rhs, lower=True, adjoint=adjoint) + + def _to_dense(self): + return self._tril + + def _add_to_tensor(self, x): + return self._tril + x diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py new file mode 100644 index 00000000000000..06140ef4a27a0b --- /dev/null +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py @@ -0,0 +1,72 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
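Editorial aside, not part of the diff: the `_determinant`, `_log_abs_determinant`, and `_solve` methods above rely on standard facts about triangular matrices (the determinant is the product of the diagonal, and a solve reduces to back-substitution). A minimal NumPy sketch of those facts, assuming only `numpy`:

```python
import numpy as np

# A well-conditioned lower triangular matrix.
tril = np.array([[2., 0., 0.],
                 [1., 3., 0.],
                 [4., 5., 6.]])

# Determinant equals the product of the diagonal, as _determinant() computes.
assert np.isclose(np.linalg.det(tril), np.prod(np.diag(tril)))

# log|det| equals the sum of log|diag|, as _log_abs_determinant() computes.
assert np.isclose(np.log(abs(np.linalg.det(tril))),
                  np.sum(np.log(np.abs(np.diag(tril)))))

# Solving tril @ x = rhs needs only back-substitution; matrix_triangular_solve
# exploits this, here we just check the solution of a dense solve.
rhs = np.array([[2.], [7.], [32.]])
x = np.linalg.solve(tril, rhs)
assert np.allclose(tril.dot(x), rhs)
```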
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Internal utilities for `LinearOperator` classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops + + +def assert_no_entries_with_modulus_zero( + x, message=None, name="assert_no_entries_with_modulus_zero"): + """Returns `Op` that asserts Tensor `x` has no entries with modulus zero. + + Args: + x: Numeric `Tensor`, real, integer, or complex. + message: A string message to prepend to failure message. + name: A name to give this `Op`. + + Returns: + An `Op` that asserts `x` has no entries with modulus zero. + """ + with ops.name_scope(name, values=[x]): + x = ops.convert_to_tensor(x, name="x") + dtype = x.dtype.base_dtype + + if dtype.is_complex: + should_be_nonzero = math_ops.complex_abs(x) + else: + should_be_nonzero = math_ops.abs(x) + + zero = ops.convert_to_tensor(0, dtype=dtype.real_dtype) + + return check_ops.assert_less(zero, should_be_nonzero, message=message) + + +def assert_zero_imag_part(x, message=None, name="assert_zero_imag_part"): + """Returns `Op` that asserts Tensor `x` has no non-zero imaginary parts. + + Args: + x: Numeric `Tensor`, real, integer, or complex. + message: A string message to prepend to failure message. + name: A name to give this `Op`. + + Returns: + An `Op` that asserts `x` has zero imaginary part. + """ + with ops.name_scope(name, values=[x]): + x = ops.convert_to_tensor(x, name="x") + dtype = x.dtype.base_dtype + + if dtype.is_floating: + return control_flow_ops.no_op() + + zero = ops.convert_to_tensor(0, dtype=dtype.real_dtype) + return check_ops.assert_equal(zero, math_ops.imag(x), message=message) diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD index 7183304111c239..e608c2e2d2e5eb 100644 --- a/tensorflow/contrib/lookup/BUILD +++ b/tensorflow/contrib/lookup/BUILD @@ -5,7 +5,7 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) +package(default_visibility = ["//tensorflow:internal"]) py_library( name = "lookup_py", diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py index d4f9c92b2de4d6..2e449afcfaea8c 100644 --- a/tensorflow/contrib/lookup/lookup_ops.py +++ b/tensorflow/contrib/lookup/lookup_ops.py @@ -141,11 +141,11 @@ def size(self, name=None): Returns: A scalar tensor containing the number of elements in this table.
""" - if name is None: - name = "%s_Size" % self._name - # pylint: disable=protected-access - return gen_data_flow_ops._lookup_table_size(self._table_ref, name=name) - # pylint: enable=protected-access + with ops.name_scope(name, "%s_Size" % self._name, + [self._table_ref]) as scope: + # pylint: disable=protected-access + return gen_data_flow_ops._lookup_table_size(self._table_ref, name=scope) + # pylint: enable=protected-access def lookup(self, keys, name=None): """Looks up `keys` in a table, outputs the corresponding values. @@ -163,9 +163,6 @@ def lookup(self, keys, name=None): TypeError: when `keys` or `default_value` doesn't match the table data types. """ - if name is None: - name = "%s_lookup_table_find" % self._name - key_tensor = keys if isinstance(keys, sparse_tensor.SparseTensor): key_tensor = keys.values @@ -174,16 +171,16 @@ def lookup(self, keys, name=None): raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." % (self._key_dtype, keys.dtype)) - # pylint: disable=protected-access - values = gen_data_flow_ops._lookup_table_find(self._table_ref, - key_tensor, - self._default_value, - name=name) - # pylint: enable=protected-access + with ops.name_scope(name, "%s_Lookup" % self._name, + [self._table_ref]) as scope: + # pylint: disable=protected-access + values = gen_data_flow_ops._lookup_table_find( + self._table_ref, key_tensor, self._default_value, name=scope) + # pylint: enable=protected-access values.set_shape(key_tensor.get_shape()) if isinstance(keys, sparse_tensor.SparseTensor): - return sparse_tensor.SparseTensor(keys.indices, values, keys.shape) + return sparse_tensor.SparseTensor(keys.indices, values, keys.dense_shape) else: return values @@ -220,13 +217,13 @@ def __init__(self, initializer, default_value, shared_name=None, name=None): Returns: A `HashTable` object. """ - with ops.name_scope(name, "hash_table", [initializer]): + with ops.name_scope(name, "hash_table", [initializer]) as scope: # pylint: disable=protected-access table_ref = gen_data_flow_ops._hash_table( shared_name=shared_name, key_dtype=initializer.key_dtype, value_dtype=initializer.value_dtype, - name=name) + name=scope) # pylint: enable=protected-access super(HashTable, self).__init__(table_ref, default_value, initializer) diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index c17b251d3ea69d..780def4269a6b0 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -28,7 +28,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops - +from tensorflow.python.util.deprecation import deprecated __all__ = ["absolute_difference", "add_loss", @@ -141,6 +141,7 @@ def _safe_mean(losses, num_present): return _safe_div(total_loss, num_present) +@deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.") @deprecated_args( "2016-11-25", "`weight` is being deprecated, use `weights`.", "weight") def compute_weighted_loss( @@ -235,6 +236,7 @@ def _num_present(losses, weights, per_batch=False): return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch) +@deprecated("2016-12-30", "Use tf.losses.add_loss instead.") @add_arg_scope def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): """Adds a externally defined loss to the collection of losses. 
@@ -247,6 +249,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): ops.add_to_collection(loss_collection, loss) +@deprecated("2016-12-30", "Use tf.losses.get_losses instead.") def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): """Gets the list of losses from the loss_collection. @@ -260,6 +263,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): return ops.get_collection(loss_collection, scope) +@deprecated("2016-12-30", "Use tf.losses.get_regularization_losses instead.") def get_regularization_losses(scope=None): """Gets the regularization losses. @@ -272,6 +276,7 @@ def get_regularization_losses(scope=None): return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope) +@deprecated("2016-12-30", "Use tf.losses.get_total_loss instead.") def get_total_loss(add_regularization_losses=True, name="total_loss"): """Returns a tensor whose value represents the total loss. @@ -294,6 +299,7 @@ def get_total_loss(add_regularization_losses=True, name="total_loss"): return math_ops.add_n(losses, name=name) +@deprecated("2016-12-30", "Use tf.losses.absolute_difference instead.") @deprecated_args( "2016-11-25", "`targets` is being deprecated, use `labels`." @@ -339,6 +345,7 @@ def absolute_difference( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.sigmoid_cross_entropy instead.") @deprecated_args( "2016-11-25", "`weight` is being deprecated, use `weights`", "weight") def sigmoid_cross_entropy( @@ -389,6 +396,7 @@ def sigmoid_cross_entropy( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.softmax_cross_entropy instead.") @deprecated_args( "2016-11-25", "`weight` is being deprecated, use `weights`", "weight") def softmax_cross_entropy( @@ -440,6 +448,7 @@ def softmax_cross_entropy( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.sparse_softmax_cross_entropy instead.") @deprecated_args( "2016-11-25", "`weight` is being deprecated, use `weights`", "weight") def sparse_softmax_cross_entropy( @@ -479,6 +488,7 @@ def sparse_softmax_cross_entropy( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.log_loss instead.") @deprecated_args( "2016-11-25", "`targets` is being deprecated, use `labels`." @@ -528,6 +538,7 @@ def log_loss( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.hinge_loss instead.") @deprecated_args( "2016-11-25", "`target` is being deprecated, use `labels`.", "target") def hinge_loss(logits, labels=None, scope=None, target=None): @@ -557,6 +568,7 @@ def hinge_loss(logits, labels=None, scope=None, target=None): return nn_ops.relu(math_ops.sub(all_ones, math_ops.mul(labels, logits))) +@deprecated("2016-12-30", "Use tf.losses.mean_squared_error instead.") @deprecated_args( "2016-11-25", "`targets` is being deprecated, use `labels`." @@ -602,6 +614,7 @@ def mean_squared_error( return compute_weighted_loss(losses, weights, scope=scope) +@deprecated("2016-12-30", "Use tf.losses.mean_pairwise_squared_error instead.") @deprecated_args( "2016-11-25", "`targets` is being deprecated, use `labels`." @@ -691,6 +704,7 @@ def mean_pairwise_squared_error( return mean_loss +@deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.") @deprecated_args( "2016-11-25", "`targets` is being deprecated, use `labels`." 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index d39dc1d4303d88..89f93ad8b85a28 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -102,6 +102,7 @@ tensorflow/core/kernels/dynamic_stitch_op.cc tensorflow/core/kernels/dynamic_partition_op.cc tensorflow/core/kernels/dense_update_ops.cc tensorflow/core/kernels/deep_conv2d.cc +tensorflow/core/kernels/xsmm_conv2d.cc tensorflow/core/kernels/cwise_ops_common.cc tensorflow/core/kernels/cwise_op_tanh.cc tensorflow/core/kernels/cwise_op_sub.cc diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index bee8b9567a3eb1..e20e494b9eb68c 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -8,93 +8,16 @@ exports_files(["LICENSE"]) package(default_visibility = ["//tensorflow:__subpackages__"]) -load("//tensorflow:tensorflow.bzl", "tf_cc_tests") -load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") -load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") -load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") -load("//tensorflow:tensorflow.bzl", "tf_kernel_library") -load( - "//tensorflow/core:platform/default/build_config.bzl", - "tf_kernel_tests_linkstatic", -) - -tf_custom_op_library( - # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it. - name = "python/ops/_set_ops.so", - srcs = [ - "ops/set_ops.cc", - ], - deps = [ - "//tensorflow/contrib/metrics/kernels:set_kernels", - ], -) - -tf_gen_op_libs( - op_lib_names = ["set_ops"], -) - -tf_gen_op_wrapper_py( - name = "set_ops", - hidden = [ - "DenseToDenseSetOperation", - "DenseToSparseSetOperation", - "SparseToSparseSetOperation", - "SetSize", - ], - deps = [":set_ops_op_lib"], -) - -tf_kernel_library( - name = "set_ops_kernels", - deps = [ - "//tensorflow/contrib/metrics/kernels:set_kernels", - "//tensorflow/core:framework", - ], - alwayslink = 1, -) +load("//tensorflow:tensorflow.bzl", "tf_py_test") py_library( name = "metrics_py", srcs = ["__init__.py"] + glob(["python/ops/*.py"]) + glob(["python/metrics/*.py"]), - data = [":python/ops/_set_ops.so"], - srcs_version = "PY2AND3", - deps = [":set_ops"], -) - -py_test( - name = "set_ops_test", - size = "small", - srcs = ["python/kernel_tests/set_ops_test.py"], srcs_version = "PY2AND3", deps = [ - ":metrics_py", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -tf_cc_tests( - size = "small", - srcs = [ - "ops/set_ops_test.cc", - ], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":set_ops_op_lib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core", - "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:ops", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - "//third_party/eigen3", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:sets", ], ) @@ -110,19 +33,6 @@ py_test( ], ) -py_test( - name = "confusion_matrix_ops_test", - size = "medium", - srcs = ["python/kernel_tests/confusion_matrix_ops_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":metrics_py", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], 
-) - py_test( name = "histogram_ops_test", size = "medium", diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py index 3ad53655bc1e55..aaa1b62d5f7baa 100644 --- a/tensorflow/contrib/metrics/__init__.py +++ b/tensorflow/contrib/metrics/__init__.py @@ -133,7 +133,6 @@ @@auc_using_histogram @@accuracy -@@confusion_matrix @@aggregate_metrics @@aggregate_metric_map diff --git a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py index dd57f0478bee15..81bbe935e74147 100644 --- a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py +++ b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py @@ -18,93 +18,13 @@ from __future__ import division from __future__ import print_function -from tensorflow.contrib.framework import tensor_util from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import confusion_matrix as cm -def confusion_matrix(predictions, labels, num_classes=None, dtype=dtypes.int32, +def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32, name=None, weights=None): - """Computes the confusion matrix from predictions and labels. - - Calculate the Confusion Matrix for a pair of prediction and - label 1-D int arrays. - - The matrix rows represent the prediction labels and the columns - represents the real labels. The confusion matrix is always a 2-D array - of shape `[n, n]`, where `n` is the number of valid labels for a given - classification task. Both prediction and labels must be 1-D arrays of - the same shape in order for this function to work. - - If `num_classes` is None, then `num_classes` will be set to the one plus - the maximum value in either predictions or labels. - Class labels are expected to start at 0. E.g., if `num_classes` was - three, then the possible labels would be `[0, 1, 2]`. - - If `weights` is not `None`, then each prediction contributes its - corresponding weight to the total value of the confusion matrix cell. - - For example: - - ```python - tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> - [[0 0 0 0 0] - [0 0 1 0 0] - [0 0 1 0 0] - [0 0 0 0 0] - [0 0 0 0 1]] - ``` - - Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, - resulting in a 5x5 confusion matrix. - - Args: - predictions: A 1-D array representing the predictions for a given - classification. - labels: A 1-D representing the real labels for the classification task. - num_classes: The possible number of labels the classification task can - have. If this value is not provided, it will be calculated - using both predictions and labels array. - dtype: Data type of the confusion matrix. - name: Scope name. - weights: An optional `Tensor` whose shape matches `predictions`. - - Returns: - A k X k matrix representing the confusion matrix, where k is the number of - possible labels in the classification task. - - Raises: - ValueError: If both predictions and labels are not 1-D vectors and have - mismatched shapes, or if `weights` is not `None` and its shape doesn't - match `predictions`. 
- """ - with ops.name_scope(name, 'confusion_matrix', - [predictions, labels, num_classes]) as name: - predictions, labels = tensor_util.remove_squeezable_dimensions( - ops.convert_to_tensor( - predictions, name='predictions'), - ops.convert_to_tensor(labels, name='labels')) - predictions = math_ops.cast(predictions, dtypes.int64) - labels = math_ops.cast(labels, dtypes.int64) - - if num_classes is None: - num_classes = math_ops.maximum(math_ops.reduce_max(predictions), - math_ops.reduce_max(labels)) + 1 - - if weights is not None: - predictions.get_shape().assert_is_compatible_with(weights.get_shape()) - weights = math_ops.cast(weights, dtype) - - shape = array_ops.pack([num_classes, num_classes]) - indices = array_ops.transpose(array_ops.pack([predictions, labels])) - values = (array_ops.ones_like(predictions, dtype) - if weights is None else weights) - cm_sparse = sparse_tensor.SparseTensor( - indices=indices, values=values, shape=math_ops.to_int64(shape)) - zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype) - - return sparse_ops.sparse_add(zero_matrix, cm_sparse) + """Deprecated. Use tf.confusion_matrix instead.""" + return cm.confusion_matrix(labels=labels, predictions=predictions, + num_classes=num_classes, dtype=dtype, name=name, + weights=weights) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index c6d6b50e9067d2..a644f6bfcb22bb 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -25,7 +25,6 @@ from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import tensor_util from tensorflow.contrib.framework.python.ops import variables as contrib_variables -from tensorflow.contrib.metrics.python.ops import confusion_matrix_ops from tensorflow.contrib.metrics.python.ops import set_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -34,6 +33,7 @@ from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops @@ -178,16 +178,10 @@ def streaming_true_positives(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'true_positives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_true_positive = math_ops.logical_and(math_ops.equal(labels, 1), - math_ops.equal(predictions, 1)) - return _count_condition(is_true_positive, weights, metrics_collections, - updates_collections) + return metrics.true_positives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_true_negatives(predictions, labels, weights=None, @@ -262,16 +256,10 @@ def streaming_false_positives(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope( - name, 'false_positives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_false_positive = math_ops.logical_and(math_ops.equal(labels, 0), - math_ops.equal(predictions, 1)) - return _count_condition(is_false_positive, weights, metrics_collections, - updates_collections) + return metrics.false_positives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_false_negatives(predictions, labels, weights=None, @@ -303,16 +291,10 @@ def streaming_false_negatives(predictions, labels, weights=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'false_negatives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_false_negative = math_ops.logical_and(math_ops.equal(labels, 1), - math_ops.equal(predictions, 0)) - return _count_condition(is_false_negative, weights, metrics_collections, - updates_collections) + return metrics.false_negatives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _broadcast_weights(weights, values): @@ -376,33 +358,9 @@ def streaming_mean(values, weights=None, metrics_collections=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope(name, 'mean', (values, weights)): - values = math_ops.to_float(values) - - total = _create_local('total', shape=[]) - count = _create_local('count', shape=[]) - - if weights is not None: - weights = math_ops.to_float(weights) - values = math_ops.mul(values, weights) - num_values = math_ops.reduce_sum(_broadcast_weights(weights, values)) - else: - num_values = math_ops.to_float(array_ops.size(values)) - - total_compute_op = state_ops.assign_add(total, math_ops.reduce_sum(values)) - count_compute_op = state_ops.assign_add(count, num_values) - - mean = _safe_div(total, count, 'value') - with ops.control_dependencies([total_compute_op, count_compute_op]): - update_op = _safe_div(total, count, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean, update_op + return metrics.mean( + values=values, weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_tensor(values, weights=None, metrics_collections=None, @@ -445,36 +403,9 @@ def streaming_mean_tensor(values, weights=None, metrics_collections=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope(name, 'mean', (values, weights)): - total = _create_local('total_tensor', shape=values.get_shape()) - count = _create_local('count_tensor', shape=values.get_shape()) - - num_values = array_ops.ones_like(values) - if weights is not None: - weights = math_ops.to_float(weights) - values = math_ops.mul(values, weights) - num_values = math_ops.mul(num_values, weights) - - total_compute_op = state_ops.assign_add(total, values) - count_compute_op = state_ops.assign_add(count, num_values) - - def compute_mean(total, count, name): - non_zero_count = math_ops.maximum(count, - array_ops.ones_like(count), - name=name) - return math_ops.truediv(total, non_zero_count, name=name) - - mean = compute_mean(total, count, 'value') - with ops.control_dependencies([total_compute_op, count_compute_op]): - update_op = compute_mean(total, count, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean, update_op + return metrics.mean_tensor( + values=values, weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_accuracy(predictions, labels, weights=None, @@ -520,14 +451,10 @@ def streaming_accuracy(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights=weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - if labels.dtype != predictions.dtype: - predictions = math_ops.cast(predictions, labels.dtype) - is_correct = math_ops.to_float(math_ops.equal(predictions, labels)) - return streaming_mean(is_correct, weights, metrics_collections, - updates_collections, name or 'accuracy') + return metrics.accuracy( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_precision(predictions, labels, weights=None, @@ -572,39 +499,10 @@ def streaming_precision(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope( - name, 'precision', (predictions, labels, weights)): - - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - true_positives, true_positives_update_op = streaming_true_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - false_positives, false_positives_update_op = streaming_false_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - - def compute_precision(name): - return array_ops.where( - math_ops.greater(true_positives + false_positives, 0), - math_ops.div(true_positives, true_positives + false_positives), - 0, - name) - - precision = compute_precision('value') - with ops.control_dependencies([true_positives_update_op, - false_positives_update_op]): - update_op = compute_precision('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, precision) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return precision, update_op + return metrics.precision( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_recall(predictions, labels, weights=None, @@ -647,38 +545,10 @@ def streaming_recall(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'recall', (predictions, labels, weights)): - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - true_positives, true_positives_update_op = streaming_true_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - false_negatives, false_negatives_update_op = streaming_false_negatives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - - def compute_recall(true_positives, false_negatives, name): - return array_ops.where( - math_ops.greater(true_positives + false_negatives, 0), - math_ops.div(true_positives, true_positives + false_negatives), - 0, - name) - - recall = compute_recall(true_positives, false_negatives, 'value') - with ops.control_dependencies([true_positives_update_op, - false_negatives_update_op]): - update_op = compute_recall(true_positives, false_negatives, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, recall) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return recall, update_op + return metrics.recall( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _streaming_confusion_matrix_at_thresholds( @@ -903,50 +773,10 @@ def streaming_auc(predictions, labels, weights=None, num_thresholds=200, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope( - name, 'auc', (predictions, labels, weights)): - if curve != 'ROC' and curve != 'PR': - raise ValueError('curve must be either ROC or PR, %s unknown' % - (curve)) - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - - # Add epsilons to avoid dividing by 0. - epsilon = 1.0e-6 - def compute_auc(tp, fn, tn, fp, name): - """Computes the roc-auc or pr-auc based on confusion counts.""" - recall = math_ops.div(tp + epsilon, tp + fn + epsilon) - if curve == 'ROC': - fp_rate = math_ops.div(fp, fp + tn + epsilon) - x = fp_rate - y = recall - else: # curve == 'PR'. - precision = math_ops.div(tp + epsilon, tp + fp + epsilon) - x = recall - y = precision - return math_ops.reduce_sum(math_ops.mul( - x[:num_thresholds - 1] - x[1:], - (y[:num_thresholds - 1] + y[1:]) / 2.), name=name) - - # sum up the areas of all the trapeziums - auc = compute_auc( - values['tp'], values['fn'], values['tn'], values['fp'], 'value') - update_op = compute_auc( - update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'], - 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, auc) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return auc, update_op + return metrics.auc( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, num_thresholds=num_thresholds, + curve=curve, updates_collections=updates_collections, name=name) def streaming_specificity_at_sensitivity( @@ -998,60 +828,11 @@ def streaming_specificity_at_sensitivity( `sensitivity` is not between 0 and 1, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - if sensitivity < 0 or sensitivity > 1: - raise ValueError('`sensitivity` must be in the range [0, 1].') - - with variable_scope.variable_scope(name, 'specificity_at_sensitivity', - (predictions, labels, weights)): - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - tp = values['tp'] - fn = values['fn'] - tn = values['tn'] - fp = values['fp'] - - def compute_specificity_at_sensitivity(name): - """Computes the specificity at the given sensitivity. - - Args: - name: The name of the operation. - - Returns: - The specificity using the aggregated values. - """ - sensitivities = math_ops.div(tp, tp + fn + kepsilon) - - # We'll need to use this trick until tf.argmax allows us to specify - # whether we should use the first or last index in case of ties. 
- min_val = math_ops.reduce_min(math_ops.abs(sensitivities - sensitivity)) - indices_at_minval = math_ops.equal( - math_ops.abs(sensitivities - sensitivity), min_val) - indices_at_minval = math_ops.to_int64(indices_at_minval) - indices_at_minval = math_ops.cumsum(indices_at_minval) - tf_index = math_ops.argmax(indices_at_minval, 0) - tf_index = math_ops.cast(tf_index, dtypes.int32) - - # Now, we have the implicit threshold, so compute the specificity: - return math_ops.div(tn[tf_index], - tn[tf_index] + fp[tf_index] + kepsilon, - name) - - specificity = compute_specificity_at_sensitivity('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_specificity_at_sensitivity('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, specificity) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return specificity, update_op + return metrics.specificity_at_sensitivity( + sensitivity=sensitivity, num_thresholds=num_thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_sensitivity_at_specificity( @@ -1103,44 +884,11 @@ def streaming_sensitivity_at_specificity( `specificity` is not between 0 and 1, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - if specificity < 0 or specificity > 1: - raise ValueError('`specificity` must be in the range [0, 1].') - - with variable_scope.variable_scope(name, 'sensitivity_at_specificity', - (predictions, labels, weights)): - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - tp = values['tp'] - fn = values['fn'] - tn = values['tn'] - fp = values['fp'] - - def compute_sensitivity_at_specificity(name): - specificities = math_ops.div(tn, tn + fp + kepsilon) - tf_index = math_ops.argmin(math_ops.abs(specificities - specificity), 0) - tf_index = math_ops.cast(tf_index, dtypes.int32) - - # Now, we have the implicit threshold, so compute the sensitivity: - return math_ops.div(tp[tf_index], - tp[tf_index] + fn[tf_index] + kepsilon, - name) - - sensitivity = compute_sensitivity_at_specificity('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_sensitivity_at_specificity('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, sensitivity) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return sensitivity, update_op + return metrics.sensitivity_at_specificity( + specificity=specificity, num_thresholds=num_thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_precision_at_thresholds(predictions, labels, thresholds, @@ -1187,29 +935,11 @@ def streaming_precision_at_thresholds(predictions, labels, thresholds, either `metrics_collections` or `updates_collections` are not a list or tuple. 
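The cumsum/argmax workaround in the removed specificity-at-sensitivity code above selects the *last* threshold whose sensitivity is closest to the target, since argmax alone returns the first maximum. A NumPy sketch of the same trick (values are made up):

    import numpy as np

    sensitivities = np.array([0.2, 0.9, 0.9, 0.4])
    target = 0.9
    distance = np.abs(sensitivities - target)
    at_min = (distance == distance.min()).astype(np.int64)   # [0, 1, 1, 0]
    # The cumulative sum keeps growing at every later match, so argmax lands on the last tie.
    last_tied_index = int(np.argmax(np.cumsum(at_min)))      # 2, not 1
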
""" - with variable_scope.variable_scope(name, 'precision_at_thresholds', - (predictions, labels, weights)): - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights, includes=('tp', 'fp')) - tp = values['tp'] - fp = values['fp'] - - # Avoid division by zero. - epsilon = 1e-7 - def compute_precision(name): - return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name) - - precision = compute_precision('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_precision('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, precision) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return precision, update_op + return metrics.precision_at_thresholds( + thresholds=thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_recall_at_thresholds(predictions, labels, thresholds, @@ -1253,29 +983,11 @@ def streaming_recall_at_thresholds(predictions, labels, thresholds, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope(name, 'recall_at_thresholds', - (predictions, labels, weights)): - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights, includes=('tp', 'fn')) - tp = values['tp'] - fn = values['fn'] - - # Avoid division by zero. - epsilon = 1e-7 - def compute_recall(name): - return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name) - - recall = compute_recall('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_recall('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, recall) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return recall, update_op + return metrics.recall_at_thresholds( + thresholds=thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _at_k_name(name, k=None, class_id=None): @@ -1413,25 +1125,11 @@ def streaming_sparse_recall_at_k(predictions, `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - default_name = _at_k_name('recall', k, class_id=class_id) - with ops.name_scope(name, default_name, (predictions, labels)) as scope: - _, top_k_idx = nn.top_k(predictions, k) - top_k_idx = math_ops.to_int64(top_k_idx) - tp, tp_update = _streaming_sparse_true_positive_at_k( - predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, - weights=weights) - fn, fn_update = _streaming_sparse_false_negative_at_k( - predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, - weights=weights) - - metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope) - update = math_ops.div( - tp_update, math_ops.add(tp_update, fn_update), name='update') - if metrics_collections: - ops.add_to_collections(metrics_collections, metric) - if updates_collections: - ops.add_to_collections(updates_collections, update) - return metric, update + return metrics.recall_at_k( + k=k, class_id=class_id, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _streaming_sparse_precision_at_k(top_k_idx, @@ -1575,19 +1273,11 @@ def streaming_sparse_precision_at_k(predictions, `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - default_name = _at_k_name('precision', k, class_id=class_id) - with ops.name_scope(name, default_name, - (predictions, labels, weights)) as scope: - _, top_k_idx = nn.top_k(predictions, k) - return _streaming_sparse_precision_at_k( - top_k_idx=top_k_idx, - labels=labels, - k=k, - class_id=class_id, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=scope) + return metrics.sparse_precision_at_k( + k=k, class_id=class_id, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) # TODO(ptucker): Validate range of values in labels? @@ -1745,12 +1435,15 @@ def expand_and_tile(tensor, multiple, dim=0, name=None): if isinstance(tensor, sparse_tensor.SparseTensor): if dim < 0: expand_dims = array_ops.reshape( - array_ops.size(tensor.shape) + dim, [1]) + array_ops.size(tensor.dense_shape) + dim, [1]) else: expand_dims = [dim] expanded_shape = array_ops.concat( - 0, (array_ops.slice(tensor.shape, [0], expand_dims), [1], - array_ops.slice(tensor.shape, expand_dims, [-1])), + 0, (array_ops.strided_slice( + tensor.dense_shape, [0], expand_dims), + [1], + array_ops.strided_slice( + tensor.dense_shape, expand_dims, [-1], end_mask=1 << 0)), name='expanded_shape') expanded = sparse_ops.sparse_reshape( tensor, shape=expanded_shape, name='expand') @@ -1917,50 +1610,10 @@ def streaming_sparse_average_precision_at_k(predictions, update: `Operation` that increments variables appropriately, and whose value matches `metric`. """ - default_name = _at_k_name('average_precision', k) - with ops.name_scope(name, default_name, (predictions, labels)) as scope: - # Calculate per-example average precision, and apply weights. - average_precision = sparse_average_precision_at_k( - predictions=predictions, labels=labels, k=k) - if weights is not None: - weights = math_ops.to_double(weights) - average_precision = math_ops.mul(average_precision, weights) - - # Create accumulation variables and update ops for max average precision and - # total average precision. - with ops.name_scope(None, 'max', (average_precision,)) as max_scope: - # `max` is the max possible precision. 
Since max for any row is 1.0: - # - For the unweighted case, this is just the number of rows. - # - For the weighted case, it's the sum of the weights broadcast across - # `average_precision` rows. - max_var = contrib_variables.local_variable( - array_ops.zeros([], dtype=dtypes.float64), name=max_scope) - if weights is None: - batch_max = math_ops.to_double( - array_ops.size(average_precision, name='batch_max')) - else: - # TODO(ptucker): More efficient way to broadcast? - broadcast_weights = math_ops.mul( - weights, array_ops.ones_like(average_precision), - name='broadcast_weights') - batch_max = math_ops.reduce_sum(broadcast_weights, name='batch_max') - max_update = state_ops.assign_add(max_var, batch_max, name='update') - with ops.name_scope(None, 'total', (average_precision,)) as total_scope: - total_var = contrib_variables.local_variable( - array_ops.zeros([], dtype=dtypes.float64), name=total_scope) - batch_total = math_ops.reduce_sum(average_precision, name='batch_total') - total_update = state_ops.assign_add(total_var, batch_total, name='update') - - # Divide total by max to get mean, for both vars and the update ops. - mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean') - update = _safe_scalar_div(total_update, max_update, name=scope) - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_average_precision) - if updates_collections: - ops.add_to_collections(updates_collections, update) - - return mean_average_precision, update + return metrics.sparse_average_precision_at_k( + k=k, predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _select_class_id(ids, selected_id): @@ -1993,7 +1646,7 @@ def _select_class_id(ids, selected_id): filled_selected_id_shape, math_ops.to_int64(selected_id)) result = set_ops.set_intersection(filled_selected_id, ids) return sparse_tensor.SparseTensor( - indices=result.indices, values=result.values, shape=ids_shape) + indices=result.indices, values=result.values, dense_shape=ids_shape) def _maybe_select_class_id(labels, predictions_idx, selected_id=None): @@ -2328,12 +1981,10 @@ def streaming_mean_absolute_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - absolute_errors = math_ops.abs(predictions - labels) - return streaming_mean(absolute_errors, weights, metrics_collections, - updates_collections, name or 'mean_absolute_error') + return metrics.mean_absolute_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_relative_error(predictions, labels, normalizer, weights=None, @@ -2381,19 +2032,10 @@ def streaming_mean_relative_error(predictions, labels, normalizer, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
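The removed average-precision code above maintains two scalars across batches, a weighted total of per-example AP and the maximum attainable total (1.0 per row, or the row's weight); their ratio is the streaming mean. A NumPy sketch with invented values:

    import numpy as np

    per_example_ap = np.array([1.0, 0.5, 0.25])
    weights        = np.array([1.0, 2.0, 1.0])
    total   = np.sum(per_example_ap * weights)   # accumulated via assign_add per batch
    max_var = np.sum(weights)                    # max possible total precision
    mean_average_precision = total / max_var     # 2.25 / 4.0 = 0.5625
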
""" - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - predictions, normalizer = tensor_util.remove_squeezable_dimensions( - predictions, normalizer) - predictions.get_shape().assert_is_compatible_with(normalizer.get_shape()) - relative_errors = array_ops.where( - math_ops.equal(normalizer, 0.0), - array_ops.zeros_like(labels), - math_ops.div(math_ops.abs(labels - predictions), normalizer)) - return streaming_mean(relative_errors, weights, metrics_collections, - updates_collections, name or 'mean_relative_error') + return metrics.mean_relative_error( + normalizer=normalizer, predictions=predictions, labels=labels, + weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_squared_error(predictions, labels, weights=None, @@ -2440,12 +2082,10 @@ def streaming_mean_squared_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - squared_error = math_ops.square(labels - predictions) - return streaming_mean(squared_error, weights, metrics_collections, - updates_collections, name or 'mean_squared_error') + return metrics.mean_squared_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_root_mean_squared_error(predictions, labels, weights=None, @@ -2492,24 +2132,10 @@ def streaming_root_mean_squared_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - value_tensor, update_op = streaming_mean_squared_error( - predictions, labels, weights, None, None, - name or 'root_mean_squared_error') - - root_mean_squared_error = math_ops.sqrt(value_tensor) - with ops.control_dependencies([update_op]): - update_op = math_ops.sqrt(update_op) - - if metrics_collections: - ops.add_to_collections(metrics_collections, root_mean_squared_error) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return root_mean_squared_error, update_op + return metrics.root_mean_squared_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_covariance(predictions, @@ -2824,12 +2450,10 @@ def streaming_percentage_less(values, threshold, weights=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - is_below_threshold = math_ops.to_float(math_ops.less(values, threshold)) - return streaming_mean(is_below_threshold, - weights, - metrics_collections, - updates_collections, - name or 'percentage_below_threshold') + return metrics.percentage_below( + values=values, threshold=threshold, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_iou(predictions, @@ -2880,65 +2504,10 @@ def streaming_mean_iou(predictions, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'mean_iou', (predictions, labels, weights)): - # Check if shape is compatible. - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - # Local variable to accumulate the predictions in the confusion matrix. - cm_dtype = dtypes.int64 if weights is not None else dtypes.float64 - total_cm = _create_local('total_confusion_matrix', - shape=[num_classes, num_classes], dtype=cm_dtype) - - # Cast the type to int64 required by confusion_matrix_ops. - predictions = math_ops.to_int64(predictions) - labels = math_ops.to_int64(labels) - num_classes = math_ops.to_int64(num_classes) - - # Flatten the input if its rank > 1. - predictions_rank = predictions.get_shape().ndims - if predictions_rank > 1: - predictions = array_ops.reshape(predictions, [-1]) - - labels_rank = labels.get_shape().ndims - if labels_rank > 1: - labels = array_ops.reshape(labels, [-1]) - - if weights is not None: - weights_rank = weights.get_shape().ndims - if weights_rank > 1: - weights = array_ops.reshape(weights, [-1]) - - # Accumulate the prediction to current confusion matrix. - current_cm = confusion_matrix_ops.confusion_matrix( - predictions, labels, num_classes, weights=weights, dtype=cm_dtype) - update_op = state_ops.assign_add(total_cm, current_cm) - - def compute_mean_iou(name): - """Compute the mean intersection-over-union via the confusion matrix.""" - sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0)) - sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1)) - cm_diag = math_ops.to_float(array_ops.diag_part(total_cm)) - denominator = sum_over_row + sum_over_col - cm_diag - - # If the value of the denominator is 0, set it to 1 to avoid - # zero division. 
- denominator = array_ops.where( - math_ops.greater(denominator, 0), - denominator, - array_ops.ones_like(denominator)) - iou = math_ops.div(cm_diag, denominator) - return math_ops.reduce_mean(iou, name=name) - - mean_iou = compute_mean_iou('mean_iou') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_iou) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_iou, update_op + return metrics.mean_iou( + num_classes=num_classes, predictions=predictions, labels=labels, + weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _next_array_size(required_size, growth_factor=1.5): diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index cf58792c4ac145..53d5de6da71476 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -4651,11 +4651,6 @@ def testSparseExpandAndTileInvalidArgs(self): shape=[3, 3, 3]) with self.assertRaisesRegexp(ValueError, 'nvalid multiple'): metric_ops.expand_and_tile(x, multiple=0) - with self.test_session(): - with self.assertRaises(tf.OpError): - metric_ops.expand_and_tile(x, multiple=1, dim=-4).eval() - with self.assertRaises(ValueError): - metric_ops.expand_and_tile(x, multiple=1, dim=4).eval() def _test_expand_and_tile( self, expected_shape, expected_value, tensor, multiple, dim=None): @@ -4768,7 +4763,7 @@ def testSparseExpandAndTile1x(self): with self.test_session(): expected_result_dim0 = tf.SparseTensorValue( indices=[[0, i[0], i[1]] for i in x.indices], values=x.values, - shape=[1, 3, 3]) + dense_shape=[1, 3, 3]) self._assert_sparse_tensors_equal( expected_result_dim0, metric_ops.expand_and_tile(x, multiple=1).eval()) @@ -4779,7 +4774,7 @@ def testSparseExpandAndTile1x(self): expected_result_dim1 = tf.SparseTensorValue( indices=[[i[0], 0, i[1]] for i in x.indices], values=x.values, - shape=[3, 1, 3]) + dense_shape=[3, 1, 3]) for dim in (-1, 1): self._assert_sparse_tensors_equal( expected_result_dim1, @@ -4787,7 +4782,7 @@ def testSparseExpandAndTile1x(self): expected_result_dim2 = tf.SparseTensorValue( indices=[[i[0], i[1], 0] for i in x.indices], values=x.values, - shape=[3, 3, 1]) + dense_shape=[3, 3, 1]) self._assert_sparse_tensors_equal( expected_result_dim2, metric_ops.expand_and_tile(x, multiple=1, dim=2).eval()) diff --git a/tensorflow/contrib/metrics/python/ops/set_ops.py b/tensorflow/contrib/metrics/python/ops/set_ops.py index dd737a14c29bf0..bca8334110c9d9 100644 --- a/tensorflow/contrib/metrics/python/ops/set_ops.py +++ b/tensorflow/contrib/metrics/python/ops/set_ops.py @@ -17,167 +17,12 @@ from __future__ import division from __future__ import print_function -from tensorflow.contrib.framework.python.framework import tensor_util +from tensorflow.python.ops import sets -from tensorflow.contrib.util import loader -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.platform import resource_loader +set_size = sets.set_size +set_intersection = sets.set_intersection -_set_ops = loader.load_op_library( - resource_loader.get_path_to_datafile("_set_ops.so")) +set_difference = sets.set_difference -_VALID_DTYPES = set([ - dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, - dtypes.uint8, dtypes.uint16, dtypes.string]) - - -def set_size(a, 
validate_indices=True): - """Compute number of unique elements along last dimension of `a`. - - Args: - a: `SparseTensor`, with indices sorted in row-major order. - validate_indices: Whether to validate the order and range of sparse indices - in `a`. - - Returns: - `int32` `Tensor` of set sizes. For `a` ranked `n`, this is a `Tensor` with - rank `n-1`, and the same 1st `n-1` dimensions as `a`. Each value is the - number of unique elements in the corresponding `[0...n-1]` dimension of `a`. - - Raises: - TypeError: If `a` is an invalid types. - """ - a = tensor_util.convert_to_tensor_or_sparse_tensor(a, name="a") - if not isinstance(a, sparse_tensor.SparseTensor): - raise TypeError("Expected `SparseTensor`, got %s." % a) - if a.values.dtype.base_dtype not in _VALID_DTYPES: - raise TypeError("Invalid dtype %s." % a.values.dtype) - # pylint: disable=protected-access - return _set_ops.set_size(a.indices, a.values, a.shape, validate_indices) - -ops.NotDifferentiable("SetSize") - - -ops.NotDifferentiable("DenseToDenseSetOperation") -ops.NotDifferentiable("DenseToSparseSetOperation") -ops.NotDifferentiable("SparseToSparseSetOperation") - - -def _set_operation(a, b, set_operation, validate_indices=True): - """Compute set operation of elements in last dimension of `a` and `b`. - - All but the last dimension of `a` and `b` must match. - - Args: - a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices - must be sorted in row-major order. - b: `Tensor` or `SparseTensor` of the same type as `a`. Must be - `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be - sorted in row-major order. - set_operation: String indicating set operaiton. See - SetOperationOp::SetOperationFromContext for valid values. - validate_indices: Whether to validate the order and range of sparse indices - in `a` and `b`. - - Returns: - A `SparseTensor` with the same rank as `a` and `b`, and all but the last - dimension the same. Elements along the last dimension contain the results - of the set operation. - - Raises: - TypeError: If inputs are invalid types. - ValueError: If `a` is sparse and `b` is dense. - """ - a = tensor_util.convert_to_tensor_or_sparse_tensor(a, name="a") - if a.dtype.base_dtype not in _VALID_DTYPES: - raise TypeError("'a' invalid dtype %s." % a.dtype) - b = tensor_util.convert_to_tensor_or_sparse_tensor(b, name="b") - if b.dtype.base_dtype != a.dtype.base_dtype: - raise TypeError("Types don't match, %s vs %s." % (a.dtype, b.dtype)) - # pylint: disable=protected-access - if isinstance(a, sparse_tensor.SparseTensor): - if isinstance(b, sparse_tensor.SparseTensor): - indices, values, shape = _set_ops.sparse_to_sparse_set_operation( - a.indices, a.values, a.shape, b.indices, b.values, b.shape, - set_operation, validate_indices) - else: - raise ValueError("Sparse,Dense is not supported, but Dense,Sparse is. " - "Please flip the order of your inputs.") - elif isinstance(b, sparse_tensor.SparseTensor): - indices, values, shape = _set_ops.dense_to_sparse_set_operation( - a, b.indices, b.values, b.shape, set_operation, validate_indices) - else: - indices, values, shape = _set_ops.dense_to_dense_set_operation( - a, b, set_operation, validate_indices) - # pylint: enable=protected-access - return sparse_tensor.SparseTensor(indices, values, shape) - - -def set_intersection(a, b, validate_indices=True): - """Compute set intersection of elements in last dimension of `a` and `b`. - - All but the last dimension of `a` and `b` must match. 
- - Args: - a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices - must be sorted in row-major order. - b: `Tensor` or `SparseTensor` of the same type as `a`. Must be - `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be - sorted in row-major order. - validate_indices: Whether to validate the order and range of sparse indices - in `a` and `b`. - - Returns: - A `SparseTensor` with the same rank as `a` and `b`, and all but the last - dimension the same. Elements along the last dimension contain the - intersections. - """ - return _set_operation(a, b, "intersection", validate_indices) - - -def set_difference(a, b, aminusb=True, validate_indices=True): - """Compute set difference of elements in last dimension of `a` and `b`. - - All but the last dimension of `a` and `b` must match. - - Args: - a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices - must be sorted in row-major order. - b: `Tensor` or `SparseTensor` of the same type as `a`. Must be - `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be - sorted in row-major order. - aminusb: Whether to subtract `b` from `a`, vs vice versa. - validate_indices: Whether to validate the order and range of sparse indices - in `a` and `b`. - - Returns: - A `SparseTensor` with the same rank as `a` and `b`, and all but the last - dimension the same. Elements along the last dimension contain the - differences. - """ - return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices) - - -def set_union(a, b, validate_indices=True): - """Compute set union of elements in last dimension of `a` and `b`. - - All but the last dimension of `a` and `b` must match. - - Args: - a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices - must be sorted in row-major order. - b: `Tensor` or `SparseTensor` of the same type as `a`. Must be - `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be - sorted in row-major order. - validate_indices: Whether to validate the order and range of sparse indices - in `a` and `b`. - - Returns: - A `SparseTensor` with the same rank as `a` and `b`, and all but the last - dimension the same. Elements along the last dimension contain the - unions. 
- """ - return _set_operation(a, b, "union", validate_indices) +set_union = sets.set_union diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py index 4482cecd6f62e7..2a6cc1de8e4ba1 100644 --- a/tensorflow/contrib/ndlstm/python/lstm1d.py +++ b/tensorflow/contrib/ndlstm/python/lstm1d.py @@ -50,7 +50,7 @@ def ndlstm_base_unrolled(inputs, noutput, scope=None, reverse=False): """ with tf.variable_scope(scope, "SeqLstmUnrolled", [inputs]): length, batch_size, _ = _shape(inputs) - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False) + lstm_cell = tf.contrib.rnn.BasicLSTMCell(noutput, state_is_tuple=False) state = tf.zeros([batch_size, lstm_cell.state_size]) output_u = [] inputs_u = tf.unstack(inputs) @@ -86,7 +86,7 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False): # TODO(tmb) make batch size, sequence_length dynamic # example: sequence_length = tf.shape(inputs)[0] _, batch_size, _ = _shape(inputs) - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False) + lstm_cell = tf.contrib.rnn.BasicLSTMCell(noutput, state_is_tuple=False) state = tf.zeros([batch_size, lstm_cell.state_size]) sequence_length = int(inputs.get_shape()[0]) sequence_lengths = tf.to_int64(tf.fill([batch_size], sequence_length)) @@ -145,7 +145,7 @@ def sequence_to_final(inputs, noutput, scope=None, name=None, reverse=False): """ with tf.variable_scope(scope, "SequenceToFinal", [inputs]): length, batch_size, _ = _shape(inputs) - lstm = tf.nn.rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False) + lstm = tf.contrib.rnn.BasicLSTMCell(noutput, state_is_tuple=False) state = tf.zeros([batch_size, lstm.state_size]) inputs_u = tf.unstack(inputs) if reverse: diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py index ec2efc75bf4a13..9dd64e5b32514c 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py @@ -134,9 +134,11 @@ def objective(x): """ d = tf.size(x) - s = tf.add(100 * tf.square(tf.sub(tf.slice(x, [1], [d - 1]), - tf.square(tf.slice(x, [0], [d - 1])))), - tf.square(tf.sub(1.0, tf.slice(x, [0], [d - 1])))) + s = tf.add(100 * tf.square( + tf.sub( + tf.strided_slice(x, [1], [d]), + tf.square(tf.strided_slice(x, [0], [d - 1])))), + tf.square(tf.sub(1.0, tf.strided_slice(x, [0], [d - 1])))) return tf.reduce_sum(s) dimension = 5 diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD index fdac3e9e49774f..a3d63b965c9932 100644 --- a/tensorflow/contrib/rnn/BUILD +++ b/tensorflow/contrib/rnn/BUILD @@ -45,10 +45,19 @@ cuda_py_tests( ], ) -tf_py_test( - name = "fused_rnn_cell_test", +cuda_py_tests( + name = "core_rnn_cell_test", size = "small", - srcs = ["python/kernel_tests/fused_rnn_cell_test.py"], + srcs = ["python/kernel_tests/core_rnn_cell_test.py"], + additional_deps = [ + "//tensorflow:tensorflow_py", + ], +) + +cuda_py_tests( + name = "rnn_test", + size = "medium", + srcs = ["python/kernel_tests/rnn_test.py"], additional_deps = [ ":rnn_py", "//tensorflow:tensorflow_py", @@ -58,9 +67,19 @@ tf_py_test( ) cuda_py_tests( - name = "lstm_ops_test", + name = "core_rnn_test", size = "medium", - srcs = ["python/kernel_tests/lstm_ops_test.py"], + srcs = ["python/kernel_tests/core_rnn_test.py"], + additional_deps = [ + "//tensorflow:tensorflow_py", + ], + shard_count = 10, +) + +tf_py_test( + name = "fused_rnn_cell_test", + size = "small", + 
srcs = ["python/kernel_tests/fused_rnn_cell_test.py"], additional_deps = [ ":rnn_py", "//tensorflow:tensorflow_py", @@ -70,9 +89,9 @@ cuda_py_tests( ) cuda_py_tests( - name = "rnn_test", + name = "lstm_ops_test", size = "medium", - srcs = ["python/kernel_tests/rnn_test.py"], + srcs = ["python/kernel_tests/lstm_ops_test.py"], additional_deps = [ ":rnn_py", "//tensorflow:tensorflow_py", @@ -178,7 +197,10 @@ filegroup( ) tf_gen_op_libs( - op_lib_names = ["lstm_ops"], + op_lib_names = [ + "lstm_ops", + "gru_ops", + ], ) tf_kernel_library( diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py index e89f603b2f52ad..b2785ce6e8142b 100644 --- a/tensorflow/contrib/rnn/__init__.py +++ b/tensorflow/contrib/rnn/__init__.py @@ -12,9 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Additional RNN operations and cells. +"""Module for constructing RNN Cells and additional RNN operations. -## This package provides additional contributed RNNCells. +## Base interface for all RNN Cells + +@@RNNCell + +## RNN Cells for use with TensorFlow's core RNN methods + +@@BasicRNNCell +@@BasicLSTMCell +@@GRUCell +@@LSTMCell + +## Classes storing split `RNNCell` state + +@@LSTMStateTuple + +## RNN Cell wrappers (RNNCells that wrap other RNNCells) + +@@MultiRNNCell +@@DropoutWrapper +@@EmbeddingWrapper +@@InputProjectionWrapper +@@OutputProjectionWrapper ### Block RNNCells @@LSTMBlockCell @@ -46,3 +67,20 @@ from tensorflow.contrib.rnn.python.ops.rnn import * from tensorflow.contrib.rnn.python.ops.rnn_cell import * # pylint: enable=unused-import,wildcard-import,line-too-long + +# Provides the links to core rnn and rnn_cell. Implementation will be moved in +# to this package instead of links as tracked in b/33235120. 
+from tensorflow.python.ops.rnn import bidirectional_rnn as static_bidirectional_rnn +from tensorflow.python.ops.rnn import rnn as static_rnn +from tensorflow.python.ops.rnn import state_saving_rnn as static_state_saving_rnn +from tensorflow.python.ops.rnn_cell import BasicLSTMCell +from tensorflow.python.ops.rnn_cell import BasicRNNCell +from tensorflow.python.ops.rnn_cell import DropoutWrapper +from tensorflow.python.ops.rnn_cell import EmbeddingWrapper +from tensorflow.python.ops.rnn_cell import GRUCell +from tensorflow.python.ops.rnn_cell import InputProjectionWrapper +from tensorflow.python.ops.rnn_cell import LSTMCell +from tensorflow.python.ops.rnn_cell import LSTMStateTuple +from tensorflow.python.ops.rnn_cell import MultiRNNCell +from tensorflow.python.ops.rnn_cell import OutputProjectionWrapper +from tensorflow.python.ops.rnn_cell import RNNCell diff --git a/tensorflow/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py similarity index 89% rename from tensorflow/python/kernel_tests/rnn_cell_test.py rename to tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index cc60e796ba2c94..472571cf289493 100644 --- a/tensorflow/python/kernel_tests/rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -57,7 +57,7 @@ def testBasicRNNCell(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 2]) m = tf.zeros([1, 2]) - g, _ = tf.nn.rnn_cell.BasicRNNCell(2)(x, m) + g, _ = tf.contrib.rnn.BasicRNNCell(2)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g], {x.name: np.array([[1., 1.]]), m.name: np.array([[0.1, 0.1]])}) @@ -68,7 +68,7 @@ def testGRUCell(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 2]) m = tf.zeros([1, 2]) - g, _ = tf.nn.rnn_cell.GRUCell(2)(x, m) + g, _ = tf.contrib.rnn.GRUCell(2)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g], {x.name: np.array([[1., 1.]]), m.name: np.array([[0.1, 0.1]])}) @@ -77,7 +77,7 @@ def testGRUCell(self): with tf.variable_scope("other", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 3]) # Test GRUCell with input_size != num_units. m = tf.zeros([1, 2]) - g, _ = tf.nn.rnn_cell.GRUCell(2)(x, m) + g, _ = tf.contrib.rnn.GRUCell(2)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g], {x.name: np.array([[1., 1., 1.]]), m.name: np.array([[0.1, 0.1]])}) @@ -89,8 +89,8 @@ def testBasicLSTMCell(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 2]) m = tf.zeros([1, 8]) - g, out_m = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)] * 2, + g, out_m = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False)] * 2, state_is_tuple=False)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g, out_m], {x.name: np.array([[1., 1.]]), @@ -120,7 +120,7 @@ def testBasicLSTMCell(self): with tf.variable_scope("other", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 3]) # Test BasicLSTMCell with input_size != num_units. 
m = tf.zeros([1, 4]) - g, out_m = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)(x, m) + g, out_m = tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g, out_m], {x.name: np.array([[1., 1., 1.]]), m.name: 0.1 * np.ones([1, 4])}) @@ -132,35 +132,35 @@ def testBasicLSTMCellStateTupleType(self): x = tf.zeros([1, 2]) m0 = (tf.zeros([1, 2]),) * 2 m1 = (tf.zeros([1, 2]),) * 2 - cell = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.BasicLSTMCell(2)] * 2, + cell = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.BasicLSTMCell(2)] * 2, state_is_tuple=True) self.assertTrue(isinstance(cell.state_size, tuple)) self.assertTrue(isinstance(cell.state_size[0], - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) self.assertTrue(isinstance(cell.state_size[1], - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) # Pass in regular tuples _, (out_m0, out_m1) = cell(x, (m0, m1)) self.assertTrue(isinstance(out_m0, - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) self.assertTrue(isinstance(out_m1, - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) # Pass in LSTMStateTuples tf.get_variable_scope().reuse_variables() zero_state = cell.zero_state(1, tf.float32) self.assertTrue(isinstance(zero_state, tuple)) self.assertTrue(isinstance(zero_state[0], - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) self.assertTrue(isinstance(zero_state[1], - tf.nn.rnn_cell.LSTMStateTuple)) + tf.contrib.rnn.LSTMStateTuple)) _, (out_m0, out_m1) = cell(x, zero_state) self.assertTrue( - isinstance(out_m0, tf.nn.rnn_cell.LSTMStateTuple)) + isinstance(out_m0, tf.contrib.rnn.LSTMStateTuple)) self.assertTrue( - isinstance(out_m1, tf.nn.rnn_cell.LSTMStateTuple)) + isinstance(out_m1, tf.contrib.rnn.LSTMStateTuple)) def testBasicLSTMCellWithStateTuple(self): with self.test_session() as sess: @@ -168,8 +168,8 @@ def testBasicLSTMCellWithStateTuple(self): x = tf.zeros([1, 2]) m0 = tf.zeros([1, 4]) m1 = tf.zeros([1, 4]) - cell = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)] * 2, + cell = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.BasicLSTMCell(2, state_is_tuple=False)] * 2, state_is_tuple=True) g, (out_m0, out_m1) = cell(x, (m0, m1)) sess.run([tf.global_variables_initializer()]) @@ -199,7 +199,7 @@ def testLSTMCell(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([batch_size, input_size]) m = tf.zeros([batch_size, state_size]) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=num_units, num_proj=num_proj, forget_bias=1.0, state_is_tuple=False) output, state = cell(x, m) @@ -229,7 +229,7 @@ def testLSTMCellVariables(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([batch_size, input_size]) m = tf.zeros([batch_size, state_size]) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=num_units, num_proj=num_proj, forget_bias=1.0, state_is_tuple=False) cell(x, m) # Execute to create variables @@ -244,8 +244,8 @@ def testOutputProjectionWrapper(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 3]) m = tf.zeros([1, 3]) - cell = tf.nn.rnn_cell.OutputProjectionWrapper( - tf.nn.rnn_cell.GRUCell(3), 2) + cell = tf.contrib.rnn.OutputProjectionWrapper( + tf.contrib.rnn.GRUCell(3), 2) g, new_m = cell(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g, 
new_m], {x.name: np.array([[1., 1., 1.]]), @@ -259,8 +259,8 @@ def testInputProjectionWrapper(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 2]) m = tf.zeros([1, 3]) - cell = tf.nn.rnn_cell.InputProjectionWrapper( - tf.nn.rnn_cell.GRUCell(3), num_proj=3) + cell = tf.contrib.rnn.InputProjectionWrapper( + tf.contrib.rnn.GRUCell(3), num_proj=3) g, new_m = cell(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g, new_m], {x.name: np.array([[1., 1.]]), @@ -275,7 +275,7 @@ def testDropoutWrapper(self): x = tf.zeros([1, 3]) m = tf.zeros([1, 3]) keep = tf.zeros([]) + 1 - g, new_m = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(3), + g, new_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(3), keep, keep)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run([g, new_m], {x.name: np.array([[1., 1., 1.]]), @@ -289,8 +289,8 @@ def testEmbeddingWrapper(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 1], dtype=tf.int32) m = tf.zeros([1, 2]) - embedding_cell = tf.nn.rnn_cell.EmbeddingWrapper( - tf.nn.rnn_cell.GRUCell(2), + embedding_cell = tf.contrib.rnn.EmbeddingWrapper( + tf.contrib.rnn.GRUCell(2), embedding_classes=3, embedding_size=2) self.assertEqual(embedding_cell.output_size, 2) g, new_m = embedding_cell(x, m) @@ -306,8 +306,8 @@ def testEmbeddingWrapperWithDynamicRnn(self): with tf.variable_scope("root"): inputs = tf.convert_to_tensor([[[0], [0]]], dtype=tf.int64) input_lengths = tf.convert_to_tensor([2], dtype=tf.int64) - embedding_cell = tf.nn.rnn_cell.EmbeddingWrapper( - tf.nn.rnn_cell.BasicLSTMCell(1, state_is_tuple=True), + embedding_cell = tf.contrib.rnn.EmbeddingWrapper( + tf.contrib.rnn.BasicLSTMCell(1, state_is_tuple=True), embedding_classes=1, embedding_size=2) outputs, _ = tf.nn.dynamic_rnn(cell=embedding_cell, @@ -323,8 +323,8 @@ def testMultiRNNCell(self): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): x = tf.zeros([1, 2]) m = tf.zeros([1, 4]) - _, ml = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.GRUCell(2)] * 2, state_is_tuple=False)(x, m) + _, ml = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.GRUCell(2)] * 2, state_is_tuple=False)(x, m) sess.run([tf.global_variables_initializer()]) res = sess.run(ml, {x.name: np.array([[1., 1.]]), m.name: np.array([[0.1, 0.1, 0.1, 0.1]])}) @@ -341,11 +341,11 @@ def testMultiRNNCellWithStateTuple(self): # Test incorrectness of state with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"): - tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.GRUCell(2)] * 2, state_is_tuple=True)(x, m_bad) + tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.GRUCell(2)] * 2, state_is_tuple=True)(x, m_bad) - _, ml = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.GRUCell(2)] * 2, state_is_tuple=True)(x, m_good) + _, ml = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.GRUCell(2)] * 2, state_is_tuple=True)(x, m_good) sess.run([tf.global_variables_initializer()]) res = sess.run(ml, {x.name: np.array([[1., 1.]]), @@ -388,7 +388,7 @@ def testBasicRNNCellMatch(self): slim_cell = rnn_cell_impl._SlimRNNCell(my_cell) # pylint: enable=protected-access slim_outputs, slim_state = slim_cell(inputs, initial_state) - rnn_cell = tf.nn.rnn_cell.BasicRNNCell(num_units) + rnn_cell = tf.contrib.rnn.BasicRNNCell(num_units) tf.get_variable_scope().reuse_variables() outputs, state = rnn_cell(inputs, initial_state) self.assertEqual(slim_outputs.get_shape(), outputs.get_shape()) diff --git 
a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py new file mode 100644 index 00000000000000..fb25f2b877bef3 --- /dev/null +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py @@ -0,0 +1,1159 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for rnn module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.python.util import nest + + +class Plus1RNNCell(tf.contrib.rnn.RNNCell): + """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" + + @property + def output_size(self): + return 5 + + @property + def state_size(self): + return 5 + + def __call__(self, input_, state, scope=None): + return (input_ + 1, state + 1) + + +class DummyMultiDimensionalLSTM(tf.contrib.rnn.RNNCell): + """LSTM Cell generating (output, new_state) = (input + 1, state + 1). + + The input to this cell may have an arbitrary number of dimensions that follow + the preceding 'Time' and 'Batch' dimensions. + """ + + def __init__(self, dims): + """Initialize the Multi-dimensional LSTM cell. + + Args: + dims: tuple that contains the dimensions of the output of the cell, + without including 'Time' or 'Batch' dimensions. + """ + if not isinstance(dims, tuple): + raise TypeError("The dimensions passed to DummyMultiDimensionalLSTM" + "should be a tuple of ints.") + self._dims = dims + self._output_size = tf.TensorShape(self._dims) + self._state_size = (tf.TensorShape(self._dims), tf.TensorShape(self._dims)) + + @property + def output_size(self): + return self._output_size + + @property + def state_size(self): + return self._state_size + + def __call__(self, input_, state, scope=None): + h, c = state + return (input_ + 1, (h + 1, c + 1)) + + +class NestedRNNCell(tf.contrib.rnn.RNNCell): + """RNN Cell generating (output, new_state) = (input + 1, state + 1). + + The input, output and state of this cell is a tuple of two tensors. 
+ """ + + @property + def output_size(self): + return (5, 5) + + @property + def state_size(self): + return (6, 6) + + def __call__(self, input_, state, scope=None): + h, c = state + x, y = input_ + return ((x + 1, y + 1), (h + 1, c + 1)) + + +class TestStateSaver(object): + + def __init__(self, batch_size, state_size): + self._batch_size = batch_size + self._state_size = state_size + self.saved_state = {} + + def state(self, name): + + if isinstance(self._state_size, dict): + state_size = self._state_size[name] + else: + state_size = self._state_size + if isinstance(state_size, int): + state_size = (state_size,) + elif isinstance(state_size, tuple): + pass + else: + raise TypeError("state_size should either be an int or a tuple") + + return tf.zeros((self._batch_size,) + state_size) + + def save_state(self, name, state): + self.saved_state[name] = state + return tf.identity(state) + + +class RNNTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def testInvalidSequenceLengthShape(self): + cell = Plus1RNNCell() + inputs = [tf.placeholder(tf.float32, shape=(3, 4))] + with self.assertRaisesRegexp(ValueError, "must be a vector"): + tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32, sequence_length=4) + + def testRNN(self): + cell = Plus1RNNCell() + batch_size = 2 + input_size = 5 + max_length = 8 # unrolled up to this length + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + outputs, state = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), len(inputs)) + for out, inp in zip(outputs, inputs): + self.assertEqual(out.get_shape(), inp.get_shape()) + self.assertEqual(out.dtype, inp.dtype) + + with self.test_session(use_gpu=False) as sess: + input_value = np.random.randn(batch_size, input_size) + values = sess.run(outputs + [state], + feed_dict={inputs[0]: input_value}) + + # Outputs + for v in values[:-1]: + self.assertAllClose(v, input_value + 1.0) + + # Final state + self.assertAllClose( + values[-1], + max_length * np.ones((batch_size, input_size), dtype=np.float32)) + + def testDropout(self): + cell = Plus1RNNCell() + full_dropout_cell = tf.contrib.rnn.DropoutWrapper( + cell, input_keep_prob=1e-12, seed=0) + batch_size = 2 + input_size = 5 + max_length = 8 + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + with tf.variable_scope("share_scope"): + outputs, state = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + with tf.variable_scope("drop_scope"): + dropped_outputs, _ = tf.contrib.rnn.static_rnn( + full_dropout_cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), len(inputs)) + for out, inp in zip(outputs, inputs): + self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list()) + self.assertEqual(out.dtype, inp.dtype) + + with self.test_session(use_gpu=False) as sess: + input_value = np.random.randn(batch_size, input_size) + values = sess.run(outputs + [state], + feed_dict={inputs[0]: input_value}) + full_dropout_values = sess.run(dropped_outputs, + feed_dict={inputs[0]: input_value}) + + for v in values[:-1]: + self.assertAllClose(v, input_value + 1.0) + for d_v in full_dropout_values[:-1]: # Add 1.0 to dropped_out (all zeros) + self.assertAllClose(d_v, np.ones_like(input_value)) + + def _testDynamicCalculation(self, use_gpu): + cell = Plus1RNNCell() + sequence_length = tf.placeholder(tf.int64) + batch_size = 2 + input_size = 5 + max_length = 8 + inputs = max_length * [ + 
tf.placeholder(tf.float32, shape=(batch_size, input_size))] + with tf.variable_scope("drop_scope"): + dynamic_outputs, dynamic_state = tf.contrib.rnn.static_rnn( + cell, inputs, sequence_length=sequence_length, dtype=tf.float32) + self.assertEqual(len(dynamic_outputs), len(inputs)) + + with self.test_session(use_gpu=use_gpu) as sess: + input_value = np.random.randn(batch_size, input_size) + dynamic_values = sess.run(dynamic_outputs, + feed_dict={inputs[0]: input_value, + sequence_length: [2, 3]}) + dynamic_state_value = sess.run([dynamic_state], + feed_dict={inputs[0]: input_value, + sequence_length: [2, 3]}) + + # outputs are fully calculated for t = 0, 1 + for v in dynamic_values[:2]: + self.assertAllClose(v, input_value + 1.0) + + # outputs at t = 2 are zero for entry 0, calculated for entry 1 + self.assertAllClose( + dynamic_values[2], + np.vstack(( + np.zeros((input_size)), + 1.0 + input_value[1, :]))) + + # outputs at t = 3+ are zero + for v in dynamic_values[3:]: + self.assertAllEqual(v, np.zeros_like(input_value)) + + # the final states are: + # entry 0: the values from the calculation at t=1 + # entry 1: the values from the calculation at t=2 + self.assertAllEqual( + dynamic_state_value[0], + np.vstack(( + 1.0 * (1 + 1) * np.ones((input_size)), + 1.0 * (2 + 1) * np.ones((input_size))))) + + def testDynamicCalculation(self): + self._testDynamicCalculation(True) + self._testDynamicCalculation(False) + + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + + # check that all the variables names starts + # with the proper scope. + tf.global_variables_initializer() + all_vars = tf.global_variables() + prefix = prefix or "rnn" + scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testScope(self): + def factory(scope): + cell = Plus1RNNCell() + batch_size = 2 + input_size = 5 + max_length = 8 # unrolled up to this length + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + return tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32, scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + + +class LSTMTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def _testNoProjNoSharding(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + cell = tf.contrib.rnn.LSTMCell(num_units, initializer=initializer, + state_is_tuple=False) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + outputs, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), len(inputs)) + for out in outputs: + self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + sess.run(outputs, feed_dict={inputs[0]: input_value}) + + def _testCellClipping(self, 
use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, cell_clip=0.0, initializer=initializer, + state_is_tuple=False) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + outputs, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), len(inputs)) + for out in outputs: + self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + values = sess.run(outputs, feed_dict={inputs[0]: input_value}) + + for value in values: + # if cell c is clipped to 0, tanh(c) = 0 => m==0 + self.assertAllEqual(value, np.zeros((batch_size, num_units))) + + def _testNoProjNoShardingSimpleStateSaver(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + state_saver = TestStateSaver(batch_size, 2 * num_units) + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=False, initializer=initializer, + state_is_tuple=False) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + with tf.variable_scope("share_scope"): + outputs, state = tf.contrib.rnn.static_state_saving_rnn( + cell, inputs, state_saver=state_saver, state_name="save_lstm") + self.assertEqual(len(outputs), len(inputs)) + for out in outputs: + self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + (last_state_value, saved_state_value) = sess.run( + [state, state_saver.saved_state["save_lstm"]], + feed_dict={inputs[0]: input_value}) + self.assertAllEqual(last_state_value, saved_state_value) + + def testNoProjNoShardingTupleStateSaver(self): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + with self.test_session(graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + state_saver = TestStateSaver(batch_size, num_units) + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=False, initializer=initializer, + state_is_tuple=True) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + with tf.variable_scope("share_scope"): + outputs, state = tf.contrib.rnn.static_state_saving_rnn( + cell, inputs, state_saver=state_saver, state_name=("c", "m")) + self.assertEqual(len(outputs), len(inputs)) + for out in outputs: + self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + last_and_saved_states = sess.run( + state + (state_saver.saved_state["c"], state_saver.saved_state["m"]), + feed_dict={inputs[0]: input_value}) + self.assertEqual(4, len(last_and_saved_states)) + self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:]) + + def testNoProjNoShardingNestedTupleStateSaver(self): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + with self.test_session(graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + state_saver = 
TestStateSaver(batch_size, {"c0": num_units, + "m0": num_units, + "c1": num_units + 1, + "m1": num_units + 1, + "c2": num_units + 2, + "m2": num_units + 2, + "c3": num_units + 3, + "m3": num_units + 3}) + def _cell(i): + return tf.contrib.rnn.LSTMCell( + num_units + i, use_peepholes=False, initializer=initializer, + state_is_tuple=True) + + # This creates a state tuple which has 4 sub-tuples of length 2 each. + cell = tf.contrib.rnn.MultiRNNCell( + [_cell(i) for i in range(4)], state_is_tuple=True) + + self.assertEqual(len(cell.state_size), 4) + for i in range(4): + self.assertEqual(len(cell.state_size[i]), 2) + + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + + state_names = (("c0", "m0"), ("c1", "m1"), + ("c2", "m2"), ("c3", "m3")) + with tf.variable_scope("share_scope"): + outputs, state = tf.contrib.rnn.static_state_saving_rnn( + cell, inputs, state_saver=state_saver, state_name=state_names) + self.assertEqual(len(outputs), len(inputs)) + + # Final output comes from _cell(3) which has state size num_units + 3 + for out in outputs: + self.assertEqual(out.get_shape().as_list(), [batch_size, num_units + 3]) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + last_states = sess.run( + list(nest.flatten(state)), feed_dict={inputs[0]: input_value}) + saved_states = sess.run( + list(state_saver.saved_state.values()), + feed_dict={inputs[0]: input_value}) + self.assertEqual(8, len(last_states)) + self.assertEqual(8, len(saved_states)) + flat_state_names = nest.flatten(state_names) + named_saved_states = dict( + zip(state_saver.saved_state.keys(), saved_states)) + + for i in range(8): + self.assertAllEqual( + last_states[i], + named_saved_states[flat_state_names[i]]) + + def _testProjNoSharding(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer, + state_is_tuple=False) + outputs, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), len(inputs)) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + sess.run(outputs, feed_dict={inputs[0]: input_value}) + + def testStateTupleWithProjAndSequenceLength(self): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + max_length = 8 + sequence_length = [4, 6] + with self.test_session(graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + cell_notuple = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer, state_is_tuple=False) + cell_tuple = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer, state_is_tuple=True) + with tf.variable_scope("root") as scope: + outputs_notuple, state_notuple = tf.contrib.rnn.static_rnn( + cell_notuple, inputs, dtype=tf.float32, + sequence_length=sequence_length, scope=scope) + scope.reuse_variables() + outputs_tuple, state_tuple = tf.contrib.rnn.static_rnn( + cell_tuple, inputs, dtype=tf.float32, + 
sequence_length=sequence_length, scope=scope) + self.assertEqual(len(outputs_notuple), len(inputs)) + self.assertEqual(len(outputs_tuple), len(inputs)) + self.assertTrue(isinstance(state_tuple, tuple)) + self.assertTrue(isinstance(state_notuple, tf.Tensor)) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + outputs_notuple_v = sess.run( + outputs_notuple, feed_dict={inputs[0]: input_value}) + outputs_tuple_v = sess.run( + outputs_tuple, feed_dict={inputs[0]: input_value}) + self.assertAllEqual(outputs_notuple_v, outputs_tuple_v) + + (state_notuple_v,) = sess.run( + (state_notuple,), feed_dict={inputs[0]: input_value}) + state_tuple_v = sess.run( + state_tuple, feed_dict={inputs[0]: input_value}) + self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v)) + + def _testProjSharding(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + num_proj_shards = 3 + num_unit_shards = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + + cell = tf.contrib.rnn.LSTMCell( + num_units, + use_peepholes=True, + num_proj=num_proj, + num_unit_shards=num_unit_shards, + num_proj_shards=num_proj_shards, + initializer=initializer, + state_is_tuple=False) + + outputs, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + + self.assertEqual(len(outputs), len(inputs)) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + sess.run(outputs, feed_dict={inputs[0]: input_value}) + + def _testDoubleInput(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + num_proj_shards = 3 + num_unit_shards = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) + inputs = max_length * [ + tf.placeholder(tf.float64, shape=(None, input_size))] + + cell = tf.contrib.rnn.LSTMCell( + num_units, + use_peepholes=True, + num_proj=num_proj, + num_unit_shards=num_unit_shards, + num_proj_shards=num_proj_shards, + initializer=initializer, + state_is_tuple=False) + + outputs, _ = tf.contrib.rnn.static_rnn( + cell, inputs, initial_state=cell.zero_state(batch_size, tf.float64)) + + self.assertEqual(len(outputs), len(inputs)) + + tf.global_variables_initializer().run() + input_value = np.asarray(np.random.randn(batch_size, input_size), + dtype=np.float64) + values = sess.run(outputs, feed_dict={inputs[0]: input_value}) + self.assertEqual(values[0].dtype, input_value.dtype) + + def _testShardNoShardEquivalentOutput(self, use_gpu): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + num_proj_shards = 3 + num_unit_shards = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + initializer = tf.constant_initializer(0.001) + + cell_noshard = tf.contrib.rnn.LSTMCell( + num_units, + num_proj=num_proj, + use_peepholes=True, + initializer=initializer, + num_unit_shards=num_unit_shards, + num_proj_shards=num_proj_shards, + state_is_tuple=False) + + cell_shard = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + initializer=initializer, num_proj=num_proj, + state_is_tuple=False) + + with tf.variable_scope("noshard_scope"): + outputs_noshard, state_noshard = 
tf.contrib.rnn.static_rnn( + cell_noshard, inputs, dtype=tf.float32) + with tf.variable_scope("shard_scope"): + outputs_shard, state_shard = tf.contrib.rnn.static_rnn( + cell_shard, inputs, dtype=tf.float32) + + self.assertEqual(len(outputs_noshard), len(inputs)) + self.assertEqual(len(outputs_noshard), len(outputs_shard)) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + feeds = dict((x, input_value) for x in inputs) + values_noshard = sess.run(outputs_noshard, feed_dict=feeds) + values_shard = sess.run(outputs_shard, feed_dict=feeds) + state_values_noshard = sess.run([state_noshard], feed_dict=feeds) + state_values_shard = sess.run([state_shard], feed_dict=feeds) + self.assertEqual(len(values_noshard), len(values_shard)) + self.assertEqual(len(state_values_noshard), len(state_values_shard)) + for (v_noshard, v_shard) in zip(values_noshard, values_shard): + self.assertAllClose(v_noshard, v_shard, atol=1e-3) + for (s_noshard, s_shard) in zip(state_values_noshard, state_values_shard): + self.assertAllClose(s_noshard, s_shard, atol=1e-3) + + def _testDoubleInputWithDropoutAndDynamicCalculation( + self, use_gpu): + """Smoke test for using LSTM with doubles, dropout, dynamic calculation.""" + + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + num_proj_shards = 3 + num_unit_shards = 2 + max_length = 8 + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + sequence_length = tf.placeholder(tf.int64) + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + inputs = max_length * [ + tf.placeholder(tf.float64, shape=(None, input_size))] + + cell = tf.contrib.rnn.LSTMCell( + num_units, + use_peepholes=True, + num_proj=num_proj, + num_unit_shards=num_unit_shards, + num_proj_shards=num_proj_shards, + initializer=initializer, + state_is_tuple=False) + dropout_cell = tf.contrib.rnn.DropoutWrapper(cell, 0.5, seed=0) + + outputs, state = tf.contrib.rnn.static_rnn( + dropout_cell, inputs, sequence_length=sequence_length, + initial_state=cell.zero_state(batch_size, tf.float64)) + + self.assertEqual(len(outputs), len(inputs)) + + tf.global_variables_initializer().run(feed_dict={sequence_length: [2, 3]}) + input_value = np.asarray(np.random.randn(batch_size, input_size), + dtype=np.float64) + values = sess.run(outputs, feed_dict={inputs[0]: input_value, + sequence_length: [2, 3]}) + state_value = sess.run([state], feed_dict={inputs[0]: input_value, + sequence_length: [2, 3]}) + self.assertEqual(values[0].dtype, input_value.dtype) + self.assertEqual(state_value[0].dtype, input_value.dtype) + + def testSharingWeightsWithReuse(self): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + max_length = 8 + with self.test_session(graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) + initializer_d = tf.random_uniform_initializer(-1, 1, seed=self._seed+1) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer, + state_is_tuple=False) + cell_d = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer_d, + state_is_tuple=False) + + with tf.variable_scope("share_scope"): + outputs0, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + with tf.variable_scope("share_scope", reuse=True): + outputs1, _ = tf.contrib.rnn.static_rnn(cell, inputs, dtype=tf.float32) + with 
tf.variable_scope("diff_scope"): + outputs2, _ = tf.contrib.rnn.static_rnn( + cell_d, inputs, dtype=tf.float32) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + output_values = sess.run( + outputs0 + outputs1 + outputs2, feed_dict={inputs[0]: input_value}) + outputs0_values = output_values[:max_length] + outputs1_values = output_values[max_length:2*max_length] + outputs2_values = output_values[2*max_length:] + self.assertEqual(len(outputs0_values), len(outputs1_values)) + self.assertEqual(len(outputs0_values), len(outputs2_values)) + for o1, o2, o3 in zip(outputs0_values, outputs1_values, outputs2_values): + # Same weights used by both RNNs so outputs should be the same. + self.assertAllEqual(o1, o2) + # Different weights used so outputs should be different. + self.assertTrue(np.linalg.norm(o1-o3) > 1e-6) + + def testSharingWeightsWithDifferentNamescope(self): + num_units = 3 + input_size = 5 + batch_size = 2 + num_proj = 4 + max_length = 8 + with self.test_session(graph=tf.Graph()) as sess: + initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None, input_size))] + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=True, + num_proj=num_proj, initializer=initializer, + state_is_tuple=False) + + with tf.name_scope("scope0"): + with tf.variable_scope("share_scope"): + outputs0, _ = tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32) + with tf.name_scope("scope1"): + with tf.variable_scope("share_scope", reuse=True): + outputs1, _ = tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32) + + tf.global_variables_initializer().run() + input_value = np.random.randn(batch_size, input_size) + output_values = sess.run( + outputs0 + outputs1, feed_dict={inputs[0]: input_value}) + outputs0_values = output_values[:max_length] + outputs1_values = output_values[max_length:] + self.assertEqual(len(outputs0_values), len(outputs1_values)) + for out0, out1 in zip(outputs0_values, outputs1_values): + self.assertAllEqual(out0, out1) + + def testNoProjNoShardingSimpleStateSaver(self): + self._testNoProjNoShardingSimpleStateSaver(use_gpu=False) + self._testNoProjNoShardingSimpleStateSaver(use_gpu=True) + + def testNoProjNoSharding(self): + self._testNoProjNoSharding(use_gpu=False) + self._testNoProjNoSharding(use_gpu=True) + + def testCellClipping(self): + self._testCellClipping(use_gpu=False) + self._testCellClipping(use_gpu=True) + + def testProjNoSharding(self): + self._testProjNoSharding(use_gpu=False) + self._testProjNoSharding(use_gpu=True) + + def testProjSharding(self): + self._testProjSharding(use_gpu=False) + self._testProjSharding(use_gpu=True) + + def testShardNoShardEquivalentOutput(self): + self._testShardNoShardEquivalentOutput(use_gpu=False) + self._testShardNoShardEquivalentOutput(use_gpu=True) + + def testDoubleInput(self): + self._testDoubleInput(use_gpu=False) + self._testDoubleInput(use_gpu=True) + + def testDoubleInputWithDropoutAndDynamicCalculation(self): + self._testDoubleInputWithDropoutAndDynamicCalculation(use_gpu=False) + self._testDoubleInputWithDropoutAndDynamicCalculation(use_gpu=True) + + +class BidirectionalRNNTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def _createBidirectionalRNN(self, + use_gpu, + use_shape, + use_sequence_length, + scope=None): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + + initializer = tf.random_uniform_initializer(-0.01, 0.01, 
seed=self._seed) + sequence_length = tf.placeholder(tf.int64) if use_sequence_length else None + cell_fw = tf.contrib.rnn.LSTMCell(num_units, + input_size, + initializer=initializer, + state_is_tuple=False) + cell_bw = tf.contrib.rnn.LSTMCell(num_units, + input_size, + initializer=initializer, + state_is_tuple=False) + inputs = max_length * [ + tf.placeholder( + tf.float32, + shape=(batch_size, input_size) if use_shape else (None, input_size)) + ] + outputs, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn( + cell_fw, + cell_bw, + inputs, + dtype=tf.float32, + sequence_length=sequence_length, + scope=scope) + self.assertEqual(len(outputs), len(inputs)) + for out in outputs: + self.assertEqual( + out.get_shape().as_list(), + [batch_size if use_shape else None, 2 * num_units]) + + input_value = np.random.randn(batch_size, input_size) + outputs = tf.stack(outputs) + + return input_value, inputs, outputs, state_fw, state_bw, sequence_length + + def _testBidirectionalRNN(self, use_gpu, use_shape): + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + input_value, inputs, outputs, state_fw, state_bw, sequence_length = ( + self._createBidirectionalRNN(use_gpu, use_shape, True)) + tf.global_variables_initializer().run() + # Run with pre-specified sequence length of 2, 3 + out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], + feed_dict={inputs[0]: input_value, + sequence_length: [2, 3]}) + + # Since the forward and backward LSTM cells were initialized with the + # same parameters, the forward and backward output has to be the same, + # but reversed in time. The format is output[time][batch][depth], and + # due to depth concatenation (as num_units=3 for both RNNs): + # - forward output: out[][][depth] for 0 <= depth < 3 + # - backward output: out[][][depth] for 4 <= depth < 6 + # + # First sequence in batch is length=2 + # Check that the time=0 forward output is equal to time=1 backward output + self.assertEqual(out[0][0][0], out[1][0][3]) + self.assertEqual(out[0][0][1], out[1][0][4]) + self.assertEqual(out[0][0][2], out[1][0][5]) + # Check that the time=1 forward output is equal to time=0 backward output + self.assertEqual(out[1][0][0], out[0][0][3]) + self.assertEqual(out[1][0][1], out[0][0][4]) + self.assertEqual(out[1][0][2], out[0][0][5]) + + # Second sequence in batch is length=3 + # Check that the time=0 forward output is equal to time=2 backward output + self.assertEqual(out[0][1][0], out[2][1][3]) + self.assertEqual(out[0][1][1], out[2][1][4]) + self.assertEqual(out[0][1][2], out[2][1][5]) + # Check that the time=1 forward output is equal to time=1 backward output + self.assertEqual(out[1][1][0], out[1][1][3]) + self.assertEqual(out[1][1][1], out[1][1][4]) + self.assertEqual(out[1][1][2], out[1][1][5]) + # Check that the time=2 forward output is equal to time=0 backward output + self.assertEqual(out[2][1][0], out[0][1][3]) + self.assertEqual(out[2][1][1], out[0][1][4]) + self.assertEqual(out[2][1][2], out[0][1][5]) + # Via the reasoning above, the forward and backward final state should be + # exactly the same + self.assertAllClose(s_fw, s_bw) + + def _testBidirectionalRNNWithoutSequenceLength(self, use_gpu, use_shape): + with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: + input_value, inputs, outputs, state_fw, state_bw, _ = ( + self._createBidirectionalRNN(use_gpu, use_shape, False)) + tf.global_variables_initializer().run() + out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], + feed_dict={inputs[0]: input_value}) + + # Since the 
forward and backward LSTM cells were initialized with the + # same parameters, the forward and backward output has to be the same, + # but reversed in time. The format is output[time][batch][depth], and + # due to depth concatenation (as num_units=3 for both RNNs): + # - forward output: out[][][depth] for 0 <= depth < 3 + # - backward output: out[][][depth] for 4 <= depth < 6 + # + # Both sequences in batch are length=8. Check that the time=i + # forward output is equal to time=8-1-i backward output + for i in xrange(8): + self.assertEqual(out[i][0][0], out[8 - 1 - i][0][3]) + self.assertEqual(out[i][0][1], out[8 - 1 - i][0][4]) + self.assertEqual(out[i][0][2], out[8 - 1 - i][0][5]) + for i in xrange(8): + self.assertEqual(out[i][1][0], out[8 - 1 - i][1][3]) + self.assertEqual(out[i][1][1], out[8 - 1 - i][1][4]) + self.assertEqual(out[i][1][2], out[8 - 1 - i][1][5]) + # Via the reasoning above, the forward and backward final state should be + # exactly the same + self.assertAllClose(s_fw, s_bw) + + def testBidirectionalRNN(self): + self._testBidirectionalRNN(use_gpu=False, use_shape=False) + self._testBidirectionalRNN(use_gpu=True, use_shape=False) + self._testBidirectionalRNN(use_gpu=False, use_shape=True) + self._testBidirectionalRNN(use_gpu=True, use_shape=True) + + def testBidirectionalRNNWithoutSequenceLength(self): + self._testBidirectionalRNNWithoutSequenceLength(use_gpu=False, + use_shape=False) + self._testBidirectionalRNNWithoutSequenceLength(use_gpu=True, + use_shape=False) + self._testBidirectionalRNNWithoutSequenceLength(use_gpu=False, + use_shape=True) + self._testBidirectionalRNNWithoutSequenceLength(use_gpu=True, + use_shape=True) + + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + # REMARKS: factory(scope) is a function accepting a scope + # as an argument, such scope can be None, a string + # or a VariableScope instance. + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + + # check that all the variables names starts + # with the proper scope. + tf.global_variables_initializer() + all_vars = tf.global_variables() + prefix = prefix or "bidirectional_rnn" + scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] + tf.logging.info("BiRNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testBidirectionalRNNScope(self): + def factory(scope): + return self._createBidirectionalRNN( + use_gpu=True, use_shape=True, + use_sequence_length=True, scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + + +class MultiDimensionalLSTMTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def testMultiDimensionalLSTMAllRNNContainers(self): + feature_dims = (3, 4, 5) + input_size = feature_dims + batch_size = 2 + max_length = 8 + sequence_length = [4, 6] + with self.test_session(graph=tf.Graph()) as sess: + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(None,) + input_size)] + inputs_using_dim = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size,) + input_size)] + inputs_c = tf.stack(inputs) + # Create a cell for the whole test. This is fine because the cell has no + # variables. 
+ cell = DummyMultiDimensionalLSTM(feature_dims) + state_saver = TestStateSaver(batch_size, input_size) + outputs_static, state_static = tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32, + sequence_length=sequence_length) + outputs_bid, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn( + cell, cell, inputs_using_dim, dtype=tf.float32, + sequence_length=sequence_length) + outputs_sav, state_sav = tf.contrib.rnn.static_state_saving_rnn( + cell, inputs_using_dim, sequence_length=sequence_length, + state_saver=state_saver, state_name=("h", "c")) + for out, inp in zip(outputs_static, inputs): + self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list()) + for out, inp in zip(outputs_bid, inputs_using_dim): + input_shape_list = inp.get_shape().as_list() + # fwd and bwd activations are concatenated along the second dim. + input_shape_list[1] *= 2 + self.assertEqual(out.get_shape().as_list(), input_shape_list) + + tf.global_variables_initializer().run() + + input_total_size = (batch_size,) + input_size + input_value = np.random.randn(*input_total_size) + outputs_static_v = sess.run( + outputs_static, feed_dict={inputs[0]: input_value}) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={inputs_using_dim[0]: input_value}) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={inputs_using_dim[0]: input_value}) + + self.assertAllEqual(outputs_static_v, outputs_sav_v) + outputs_static_array = np.array(outputs_static_v) + outputs_static_array_double = np.concatenate( + (outputs_static_array, outputs_static_array), axis=2) + outputs_bid_array = np.array(outputs_bid_v) + self.assertAllEqual(outputs_static_array_double, outputs_bid_array) + + state_static_v = sess.run( + state_static, feed_dict={inputs[0]: input_value}) + state_bid_fw_v = sess.run( + state_fw, feed_dict={inputs_using_dim[0]: input_value}) + state_bid_bw_v = sess.run( + state_bw, feed_dict={inputs_using_dim[0]: input_value}) + state_sav_v = sess.run( + state_sav, feed_dict={inputs_using_dim[0]: input_value}) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_sav_v)) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_bid_fw_v)) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_bid_bw_v)) + + +class NestedLSTMTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def testNestedIOLSTMAllRNNContainers(self): + input_size = 5 + batch_size = 2 + state_size = 6 + max_length = 8 + sequence_length = [4, 6] + with self.test_session(graph=tf.Graph()) as sess: + state_saver = TestStateSaver(batch_size, state_size) + single_input = (tf.placeholder(tf.float32, shape=(None, input_size)), + tf.placeholder(tf.float32, shape=(None, input_size))) + inputs = max_length * [single_input] + inputs_c = (tf.stack([input_[0] for input_ in inputs]), + tf.stack([input_[1] for input_ in inputs])) + single_input_using_dim = ( + tf.placeholder(tf.float32, shape=(batch_size, input_size)), + tf.placeholder(tf.float32, shape=(batch_size, input_size))) + inputs_using_dim = max_length * [single_input_using_dim] + + # Create a cell for the whole test. This is fine because the cell has no + # variables. 
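# (Editor's note, illustrative sketch only -- not part of the patch.) The
# "cell with no variables" trick used by these tests: any RNNCell whose call
# creates no tf.get_variable() entries can be reused across static_rnn,
# static_bidirectional_rnn and static_state_saving_rnn without variable-sharing
# or initialization concerns. A minimal hypothetical cell of that kind,
# assuming the contrib-era RNNCell interface:
#
#   class PlusOneCell(tf.contrib.rnn.RNNCell):
#     """Adds 1.0 to its input; creates no variables."""
#     @property
#     def state_size(self):
#       return 5
#     @property
#     def output_size(self):
#       return 5
#     def __call__(self, inputs, state, scope=None):
#       return inputs + 1.0, state + 1.0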
+ cell = NestedRNNCell() + outputs_static, state_static = tf.contrib.rnn.static_rnn( + cell, inputs, dtype=tf.float32, + sequence_length=sequence_length) + outputs_bid, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn( + cell, cell, inputs_using_dim, dtype=tf.float32, + sequence_length=sequence_length) + outputs_sav, state_sav = tf.contrib.rnn.static_state_saving_rnn( + cell, inputs_using_dim, sequence_length=sequence_length, + state_saver=state_saver, state_name=("h", "c")) + + def _assert_same_shape(input1, input2, double=False): + flat_input1 = nest.flatten(input1) + flat_input2 = nest.flatten(input2) + for inp1, inp2 in zip(flat_input1, flat_input2): + input_shape = inp1.get_shape().as_list() + if double: + input_shape[1] *= 2 + self.assertEqual(input_shape, inp2.get_shape().as_list()) + + _assert_same_shape(inputs, outputs_static) + _assert_same_shape(inputs_using_dim, outputs_sav) + _assert_same_shape(inputs_using_dim, outputs_bid, double=True) + + tf.global_variables_initializer().run() + + input_total_size = (batch_size, input_size) + input_value = (np.random.randn(*input_total_size), + np.random.randn(*input_total_size)) + outputs_static_v = sess.run( + outputs_static, feed_dict={single_input: input_value}) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={single_input_using_dim: input_value}) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={single_input_using_dim: input_value}) + + self.assertAllEqual(outputs_static_v, outputs_sav_v) + outputs_static_array = np.array(outputs_static_v) + outputs_static_array_double = np.concatenate( + (outputs_static_array, outputs_static_array), axis=3) + outputs_bid_array = np.array(outputs_bid_v) + self.assertAllEqual(outputs_static_array_double, outputs_bid_array) + + state_static_v = sess.run( + state_static, feed_dict={single_input: input_value}) + state_bid_fw_v = sess.run( + state_fw, feed_dict={single_input_using_dim: input_value}) + state_bid_bw_v = sess.run( + state_bw, feed_dict={single_input_using_dim: input_value}) + state_sav_v = sess.run( + state_sav, feed_dict={single_input_using_dim: input_value}) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_sav_v)) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_bid_fw_v)) + self.assertAllEqual( + np.hstack(state_static_v), np.hstack(state_bid_bw_v)) + + +class StateSaverRNNTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + tf.global_variables_initializer() + + # check that all the variables names starts + # with the proper scope. 
+ all_vars = tf.global_variables() + prefix = prefix or "rnn" + scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testStateSaverRNNScope(self): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + def factory(scope): + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + state_saver = TestStateSaver(batch_size, 2 * num_units) + cell = tf.contrib.rnn.LSTMCell( + num_units, use_peepholes=False, initializer=initializer, + state_is_tuple=False) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + return tf.contrib.rnn.static_state_saving_rnn( + cell, inputs, state_saver=state_saver, + state_name="save_lstm", scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/rnn/python/kernel_tests/fused_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/fused_rnn_cell_test.py index 0b9a1b42af50e3..7f73bd18803b6e 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/fused_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/fused_rnn_cell_test.py @@ -30,7 +30,7 @@ def testBasicRNNFusedWrapper(self): with self.test_session() as sess: initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) - cell = tf.nn.rnn_cell.BasicRNNCell(10) + cell = tf.contrib.rnn.BasicRNNCell(10) batch_size = 5 input_size = 20 timelen = 15 @@ -83,7 +83,7 @@ def testBasicRNNFusedWrapper(self): def testTimeReversedFusedRNN(self): with self.test_session() as sess: initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890213) - cell = tf.nn.rnn_cell.BasicRNNCell(10) + cell = tf.contrib.rnn.BasicRNNCell(10) batch_size = 5 input_size = 20 timelen = 15 diff --git a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py index 9f008023bff33f..b6903eee29eec9 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py @@ -62,7 +62,7 @@ def testBlockGRUToGRUCellSingleStep(self): # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - output = tf.nn.rnn_cell.GRUCell(cell_size)(x, h) + output = tf.contrib.rnn.GRUCell(cell_size)(x, h) sess.run([tf.global_variables_initializer()]) basic_res = sess.run([output], {x: x_value, h: h_value}) @@ -112,7 +112,7 @@ def testBlockGRUToGRUCellMultiStep(self): # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.GRUCell(cell_size) + cell = tf.contrib.rnn.GRUCell(cell_size) outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn( cell, inputs=concat_x, @@ -158,7 +158,7 @@ def testDerivativeOfBlockGRUToGRUCellSingleStep(self): output = gru_ops.GRUBlockCell(cell_size)(x, h) sess.run([tf.global_variables_initializer()]) - all_variables = tf.all_variables()[0:4] + all_variables = tf.global_variables()[0:4] [w_ru, b_ru, w_c, b_c] = all_variables d_new_h_wrt_x = tf.gradients([output], x) @@ -175,10 +175,10 @@ def testDerivativeOfBlockGRUToGRUCellSingleStep(self): # Gradients from the basic GRU cell implementation. 
with tf.variable_scope("basic", initializer=initializer): - output = tf.nn.rnn_cell.GRUCell(cell_size)(x, h) + output = tf.contrib.rnn.GRUCell(cell_size)(x, h) sess.run([tf.global_variables_initializer()]) - all_variables = tf.all_variables()[4:8] + all_variables = tf.global_variables()[4:8] [w_ru, b_ru, w_c, b_c] = all_variables d_new_h_wrt_x = tf.gradients([output], x) @@ -239,7 +239,7 @@ def testDerivativeOfBlockGRUToGRUCellMultiSteps(self): # Gradients from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.GRUCell(cell_size) + cell = tf.contrib.rnn.GRUCell(cell_size) outputs_dynamic, _ = tf.nn.dynamic_rnn( cell, @@ -281,7 +281,7 @@ def testGradient(self): sess.run([tf.global_variables_initializer()]) - all_variables = tf.all_variables() + all_variables = tf.global_variables() [w_ru, b_ru, w_c, b_c] = all_variables[:4] @@ -358,7 +358,7 @@ def training_gru_block_vs_gru_cell(batch_size, # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.GRUCell(cell_size) + cell = tf.contrib.rnn.GRUCell(cell_size) outputs_dynamic, _ = tf.nn.dynamic_rnn( cell, @@ -427,7 +427,7 @@ def inference_gru_block_vs_gru_cell(batch_size, # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.GRUCell(cell_size) + cell = tf.contrib.rnn.GRUCell(cell_size) outputs_dynamic, _ = tf.nn.dynamic_rnn( cell, inputs=concat_x, @@ -474,7 +474,7 @@ def single_bprop_step_gru_block_vs_gru_cell(batch_size, # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): - output = tf.nn.rnn_cell.GRUCell(cell_size)(tf.identity(x), + output = tf.contrib.rnn.GRUCell(cell_size)(tf.identity(x), tf.identity(h)) sess.run([tf.global_variables_initializer()]) grad_output_wrt_input = tf.gradients([output], h) diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py index 77da7352843c91..f4dc91e337f2d6 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py @@ -53,7 +53,7 @@ def testLSTMBlockCell(self): m1 = tf.zeros([1, 2]) m2 = tf.zeros([1, 2]) m3 = tf.zeros([1, 2]) - g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.contrib.rnn.MultiRNNCell( [tf.contrib.rnn.LSTMBlockCell(2)] * 2, state_is_tuple=True)(x, ( (m0, m1), (m2, m3))) sess.run([tf.global_variables_initializer()]) @@ -73,8 +73,8 @@ def testLSTMBlockCell(self): def testCompatibleNames(self): with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()): - cell = tf.nn.rnn_cell.LSTMCell(10) - pcell = tf.nn.rnn_cell.LSTMCell(10, use_peepholes=True) + cell = tf.contrib.rnn.LSTMCell(10) + pcell = tf.contrib.rnn.LSTMCell(10, use_peepholes=True) inputs = [tf.zeros([4, 5])] * 6 tf.nn.rnn(cell, inputs, dtype=tf.float32, scope="basic") tf.nn.rnn(pcell, inputs, dtype=tf.float32, scope="peephole") @@ -116,8 +116,8 @@ def testLSTMBasicToBlockCell(self): m1 = tf.zeros([1, 2]) m2 = tf.zeros([1, 2]) m3 = tf.zeros([1, 2]) - g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.BasicLSTMCell( + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.BasicLSTMCell( 2, state_is_tuple=True)] * 2, state_is_tuple=True)(x, ((m0, m1), (m2, m3))) 
sess.run([tf.global_variables_initializer()]) @@ -133,7 +133,7 @@ def testLSTMBasicToBlockCell(self): m1 = tf.zeros([1, 2]) m2 = tf.zeros([1, 2]) m3 = tf.zeros([1, 2]) - g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.contrib.rnn.MultiRNNCell( [tf.contrib.rnn.LSTMBlockCell(2)] * 2, state_is_tuple=True)(x, ( (m0, m1), (m2, m3))) sess.run([tf.global_variables_initializer()]) @@ -164,8 +164,8 @@ def testLSTMBasicToBlockCellPeeping(self): m1 = tf.zeros([1, 2]) m2 = tf.zeros([1, 2]) m3 = tf.zeros([1, 2]) - g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.LSTMCell( + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.contrib.rnn.MultiRNNCell( + [tf.contrib.rnn.LSTMCell( 2, use_peepholes=True, state_is_tuple=True)] * 2, state_is_tuple=True)(x, ((m0, m1), (m2, m3))) sess.run([tf.global_variables_initializer()]) @@ -181,7 +181,7 @@ def testLSTMBasicToBlockCellPeeping(self): m1 = tf.zeros([1, 2]) m2 = tf.zeros([1, 2]) m3 = tf.zeros([1, 2]) - g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.contrib.rnn.MultiRNNCell( [tf.contrib.rnn.LSTMBlockCell( 2, use_peephole=True)] * 2, state_is_tuple=True)(x, ((m0, m1), (m2, m3))) @@ -212,7 +212,7 @@ def testLSTMBasicToBlock(self): initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True) + cell = tf.contrib.rnn.BasicLSTMCell(cell_size, state_is_tuple=True) outputs, state = tf.nn.rnn(cell, inputs, dtype=tf.float32) sess.run([tf.global_variables_initializer()]) @@ -282,7 +282,7 @@ def testLSTMBasicToBlockPeeping(self): initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( cell_size, use_peepholes=True, state_is_tuple=True) outputs, state = tf.nn.rnn(cell, inputs, dtype=tf.float32) @@ -363,7 +363,7 @@ def testLSTMFusedSequenceLengths(self): initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890213) with tf.variable_scope("basic", initializer=initializer): - cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True) + cell = tf.contrib.rnn.BasicLSTMCell(cell_size, state_is_tuple=True) outputs, state = tf.nn.rnn(cell, inputs, dtype=tf.float32, diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py index ab01fa2c100329..bbc65656df2633 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py @@ -387,7 +387,7 @@ def testAttentionCellWrapperFailures(self): num_units = 8 for state_is_tuple in [False, True]: with tf.Graph().as_default(): - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( + lstm_cell = tf.contrib.rnn.BasicLSTMCell( num_units, state_is_tuple=state_is_tuple) with self.assertRaisesRegexp( ValueError, "attn_length should be greater than zero, got 0"): @@ -398,7 +398,7 @@ def testAttentionCellWrapperFailures(self): tf.contrib.rnn.AttentionCellWrapper(lstm_cell, -1, state_is_tuple=state_is_tuple) with tf.Graph().as_default(): - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( + lstm_cell = tf.contrib.rnn.BasicLSTMCell( num_units, state_is_tuple=True) with self.assertRaisesRegexp( ValueError, "Cell returns tuple of states, but the flag " @@ 
-415,7 +415,7 @@ def testAttentionCellWrapperZeros(self): with tf.Graph().as_default(): with self.test_session() as sess: with tf.variable_scope("state_is_tuple_" + str(state_is_tuple)): - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( + lstm_cell = tf.contrib.rnn.BasicLSTMCell( num_units, state_is_tuple=state_is_tuple) cell = tf.contrib.rnn.AttentionCellWrapper( lstm_cell, attn_length, state_is_tuple=state_is_tuple) @@ -460,7 +460,7 @@ def testAttentionCellWrapperValues(self): with tf.Graph().as_default(): with self.test_session() as sess: with tf.variable_scope("state_is_tuple_" + str(state_is_tuple)): - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( + lstm_cell = tf.contrib.rnn.BasicLSTMCell( num_units, state_is_tuple=state_is_tuple) cell = tf.contrib.rnn.AttentionCellWrapper( lstm_cell, attn_length, state_is_tuple=state_is_tuple) @@ -526,7 +526,7 @@ def testAttentionCellWrapperCorrectResult(self): for state_is_tuple in [False, True]: with tf.Session() as sess: with tf.variable_scope("state_is_tuple", reuse=state_is_tuple): - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( + lstm_cell = tf.contrib.rnn.BasicLSTMCell( num_units, state_is_tuple=state_is_tuple) cell = tf.contrib.rnn.AttentionCellWrapper( lstm_cell, attn_length, state_is_tuple=state_is_tuple) @@ -563,13 +563,13 @@ def testBasicLSTMCell(self): x = tf.zeros([1, 2]) c0 = tf.zeros([1, 2]) h0 = tf.zeros([1, 2]) - state0 = tf.nn.rnn_cell.LSTMStateTuple(c0, h0) + state0 = tf.contrib.rnn.LSTMStateTuple(c0, h0) c1 = tf.zeros([1, 2]) h1 = tf.zeros([1, 2]) - state1 = tf.nn.rnn_cell.LSTMStateTuple(c1, h1) + state1 = tf.contrib.rnn.LSTMStateTuple(c1, h1) state = (state0, state1) cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2) - cell = tf.nn.rnn_cell.MultiRNNCell([cell] * 2) + cell = tf.contrib.rnn.MultiRNNCell([cell] * 2) g, out_m = cell(x, state) sess.run([tf.global_variables_initializer()]) res = sess.run([g, out_m], @@ -603,7 +603,7 @@ def testBasicLSTMCell(self): x = tf.zeros([1, 3]) # Test BasicLSTMCell with input_size != num_units. 
c = tf.zeros([1, 2]) h = tf.zeros([1, 2]) - state = tf.nn.rnn_cell.LSTMStateTuple(c, h) + state = tf.contrib.rnn.LSTMStateTuple(c, h) cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2) g, out_m = cell(x, state) sess.run([tf.global_variables_initializer()]) @@ -627,12 +627,12 @@ def testBasicLSTMCellWithStateTuple(self): x = tf.zeros([1, 2]) c0 = tf.zeros([1, 2]) h0 = tf.zeros([1, 2]) - state0 = tf.nn.rnn_cell.LSTMStateTuple(c0, h0) + state0 = tf.contrib.rnn.LSTMStateTuple(c0, h0) c1 = tf.zeros([1, 2]) h1 = tf.zeros([1, 2]) - state1 = tf.nn.rnn_cell.LSTMStateTuple(c1, h1) + state1 = tf.contrib.rnn.LSTMStateTuple(c1, h1) cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2) - cell = tf.nn.rnn_cell.MultiRNNCell([cell] * 2) + cell = tf.contrib.rnn.MultiRNNCell([cell] * 2) h, (s0, s1) = cell(x, (state0, state1)) sess.run([tf.global_variables_initializer()]) res = sess.run([h, s0, s1], @@ -682,7 +682,7 @@ def _is_close_in(x, items, digits=4): x = tf.zeros([1, 5]) c = tf.zeros([1, 5]) h = tf.zeros([1, 5]) - state = tf.nn.rnn_cell.LSTMStateTuple(c, h) + state = tf.contrib.rnn.LSTMStateTuple(c, h) cell = tf.contrib.rnn.LayerNormBasicLSTMCell( num_units, layer_norm=False, dropout_keep_prob=keep_prob) diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py index 91b3d7f41759cd..8374b505a7ea24 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py @@ -46,10 +46,10 @@ def _createStackBidirectionalRNN(self, initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) sequence_length = tf.placeholder(tf.int64) if use_sequence_length else None - self.cells_fw = [tf.nn.rnn_cell.LSTMCell( + self.cells_fw = [tf.contrib.rnn.LSTMCell( num_units, input_size, initializer=initializer, state_is_tuple=False) for num_units in self.layers] - self.cells_bw = [tf.nn.rnn_cell.LSTMCell( + self.cells_bw = [tf.contrib.rnn.LSTMCell( num_units, input_size, initializer=initializer, state_is_tuple=False) for num_units in self.layers] @@ -208,10 +208,10 @@ def _createStackBidirectionalDynamicRNN(self, initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) sequence_length = tf.placeholder(tf.int64) - self.cells_fw = [tf.nn.rnn_cell.LSTMCell( + self.cells_fw = [tf.contrib.rnn.LSTMCell( num_units, input_size, initializer=initializer, state_is_tuple=False) for num_units in self.layers] - self.cells_bw = [tf.nn.rnn_cell.LSTMCell( + self.cells_bw = [tf.contrib.rnn.LSTMCell( num_units, input_size, initializer=initializer, state_is_tuple=False) for num_units in self.layers] @@ -382,7 +382,7 @@ def _testScope(self, factory, prefix="prefix", use_outer_scope=True): # check that all the variables names starts with the proper scope. 
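# (Editor's note, illustrative sketch only -- not part of the patch.) The
# substitution just below is one of the recurring renames in this commit: the
# deprecated tf.all_variables() / tf.initialize_all_variables() become
# tf.global_variables() / tf.global_variables_initializer(). A minimal,
# hypothetical usage of the new spellings:
#
#   init_op = tf.global_variables_initializer()
#   with tf.Session() as sess:
#     sess.run(init_op)
#     print([v.name for v in tf.global_variables()])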
tf.global_variables_initializer() - all_vars = tf.all_variables() + all_vars = tf.global_variables() prefix = prefix or "stack_bidirectional_rnn" scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] tf.logging.info("StackRNN with scope: %s (%s)" diff --git a/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py b/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py index 4be035e71aad76..1bbd39c15ef5cc 100644 --- a/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py @@ -134,7 +134,7 @@ class TimeReversedFusedRNN(FusedRNNCell): For example, ```python - cell = tf.nn.rnn_cell.BasicRNNCell(10) + cell = tf.contrib.rnn.BasicRNNCell(10) fw_lstm = tf.contrib.rnn.FusedRNNCellAdaptor(cell, use_dynamic_rnn=True) bw_lstm = tf.contrib.rnn.TimeReversedFusedRNN(fw_lstm) fw_out, fw_state = fw_lstm(inputs) diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py index a5bb331f8199cc..f71285b6d9e626 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py @@ -103,7 +103,7 @@ def test_dynamic_rnn_decoder_time_major(self): scope=scope)) # Run model - tf.initialize_all_variables().run() + tf.global_variables_initializer().run() decoder_outputs_train_res, decoder_state_train_res = sess.run( [decoder_outputs_train, decoder_state_train]) decoder_outputs_inference_res, decoder_state_inference_res = sess.run( diff --git a/tensorflow/contrib/session_bundle/constants.py b/tensorflow/contrib/session_bundle/constants.py index 1339d0be464d2b..6ced73241afdda 100644 --- a/tensorflow/contrib/session_bundle/constants.py +++ b/tensorflow/contrib/session_bundle/constants.py @@ -23,9 +23,11 @@ ASSETS_DIRECTORY = "assets" EXPORT_BASE_NAME = "export" EXPORT_SUFFIX_NAME = "meta" -META_GRAPH_DEF_FILENAME = EXPORT_BASE_NAME + "." + EXPORT_SUFFIX_NAME -VARIABLES_FILENAME = EXPORT_BASE_NAME -VARIABLES_FILENAME_PATTERN = VARIABLES_FILENAME + "-?????-of-?????" +META_GRAPH_DEF_FILENAME = "export.meta" +VARIABLES_FILENAME = "export" +VARIABLES_FILENAME_PATTERN = "export-?????-of-?????" +VARIABLES_FILENAME_PATTERN_V2 = "export.data-?????-of-?????" +VARIABLES_INDEX_FILENAME_V2 = "export.index" INIT_OP_KEY = "serving_init_op" SIGNATURES_KEY = "serving_signatures" ASSETS_KEY = "serving_assets" diff --git a/tensorflow/contrib/session_bundle/session_bundle.py b/tensorflow/contrib/session_bundle/session_bundle.py index 5b3c831936bec0..8b4c1aa9bccd0a 100644 --- a/tensorflow/contrib/session_bundle/session_bundle.py +++ b/tensorflow/contrib/session_bundle/session_bundle.py @@ -57,18 +57,44 @@ def load_session_bundle_from_path(export_dir, target="", config=None): if not file_io.file_exists(meta_graph_filename): raise RuntimeError("Expected meta graph file missing %s" % meta_graph_filename) - variables_filename = os.path.join(export_dir, - constants.VARIABLES_FILENAME) - if not file_io.file_exists(variables_filename): - variables_filename = os.path.join( - export_dir, constants.VARIABLES_FILENAME_PATTERN) - if not file_io.get_matching_files(variables_filename): - # If graph_util.convert_variables_to_constants() is called on a model - # it won't have any variables, and that's OK. - # - # TODO(yxshi): verify that the graph_def in fact does not have any - # reachable variables. 
- variables_filename = None + + variables_filename = "" + variables_filename_list = [] + checkpoint_sharded = False + + variables_index_filename = os.path.join( + export_dir, constants.VARIABLES_INDEX_FILENAME_V2) + checkpoint_v2 = file_io.file_exists(variables_index_filename) + + # Find matching checkpoint files. + if checkpoint_v2: + # The checkpoint is in v2 format. + variables_filename_pattern = os.path.join( + export_dir, constants.VARIABLES_FILENAME_PATTERN_V2) + variables_filename_list = file_io.get_matching_files( + variables_filename_pattern) + checkpoint_sharded = True + else: + variables_filename = os.path.join(export_dir, + constants.VARIABLES_FILENAME) + if file_io.file_exists(variables_filename): + variables_filename_list = [variables_filename] + else: + variables_filename = os.path.join(export_dir, + constants.VARIABLES_FILENAME_PATTERN) + variables_filename_list = file_io.get_matching_files(variables_filename) + checkpoint_sharded = True + + # Prepare the files to restore a session. + if not variables_filename_list: + restore_files = "" + elif checkpoint_v2 or not checkpoint_sharded: + # For checkpoint v2 or v1 with non-sharded files, use "export" to restore + # the session. + restore_files = constants.VARIABLES_FILENAME + else: + restore_files = constants.VARIABLES_FILENAME_PATTERN + assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY) # Reads meta graph file. @@ -94,8 +120,8 @@ def load_session_bundle_from_path(export_dir, target="", config=None): # Import the graph. saver = tf.train.import_meta_graph(meta_graph_def) # Restore the session. - if variables_filename: - saver.restore(sess, variables_filename) + if restore_files: + saver.restore(sess, os.path.join(export_dir, restore_files)) init_op_tensor = None if constants.INIT_OP_KEY in collection_def: diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.py b/tensorflow/contrib/session_bundle/session_bundle_test.py index da049cb98a58d7..66385f527a04e3 100644 --- a/tensorflow/contrib/session_bundle/session_bundle_test.py +++ b/tensorflow/contrib/session_bundle/session_bundle_test.py @@ -51,7 +51,7 @@ def _checkRegressionSignature(self, signatures, sess): self.assertEqual(y[0][2], 3) self.assertEqual(y[0][3], 3.5) - def _checkNamedSigantures(self, signatures, sess): + def _checkNamedSignatures(self, signatures, sess): named_signatures = signatures.named_signatures input_name = (named_signatures["inputs"].generic_signature.map["x"] .tensor_name) @@ -88,7 +88,7 @@ def testBasic(self): signatures = manifest_pb2.Signatures() signatures_any[0].Unpack(signatures) self._checkRegressionSignature(signatures, sess) - self._checkNamedSigantures(signatures, sess) + self._checkNamedSignatures(signatures, sess) def testBadPath(self): base_path = tf.test.test_src_dir_path("/no/such/a/dir") @@ -99,6 +99,32 @@ def testBadPath(self): config=tf.ConfigProto(device_count={"CPU": 2})) self.assertTrue("Expected meta graph file missing" in str(cm.exception)) + def testVarCheckpointV2(self): + base_path = tf.test.test_src_dir_path( + "contrib/session_bundle/example/half_plus_two_ckpt_v2/00000123") + tf.reset_default_graph() + sess, meta_graph_def = session_bundle.load_session_bundle_from_path( + base_path, target="", config=tf.ConfigProto(device_count={"CPU": 2})) + + self.assertTrue(sess) + asset_path = os.path.join(base_path, constants.ASSETS_DIRECTORY) + with sess.as_default(): + path1, path2 = sess.run(["filename1:0", "filename2:0"]) + self.assertEqual( + compat.as_bytes(os.path.join(asset_path, "hello1.txt")), path1) 
+ self.assertEqual( + compat.as_bytes(os.path.join(asset_path, "hello2.txt")), path2) + + collection_def = meta_graph_def.collection_def + + signatures_any = collection_def[constants.SIGNATURES_KEY].any_list.value + self.assertEquals(len(signatures_any), 1) + + signatures = manifest_pb2.Signatures() + signatures_any[0].Unpack(signatures) + self._checkRegressionSignature(signatures, sess) + self._checkNamedSignatures(signatures, sess) + class SessionBundleLoadNoVarsTest(tf.test.TestCase): """Test the case where there are no variables in the graph.""" diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md index 8454ddc2ecd59f..e6100ef675af5b 100644 --- a/tensorflow/contrib/slim/README.md +++ b/tensorflow/contrib/slim/README.md @@ -901,7 +901,7 @@ slim.evaluation.evaluation_loop( log_dir, num_evals=num_batches, eval_op=names_to_updates.values(), - summary_op=tf.merge_summary(summary_ops), + summary_op=tf.summary.merge(summary_ops), eval_interval_secs=eval_interval_secs) ``` diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py index 7864fa099862b1..fda7b15b322d8b 100644 --- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py +++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py @@ -277,5 +277,5 @@ def get_data_files(data_sources): else: data_files = [data_sources] if not data_files: - raise ValueError('No data files found in %s', data_sources) + raise ValueError('No data files found in %s' % (data_sources,)) return data_files diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py index 6d8446bf88e3e1..ca131cfb7c4139 100644 --- a/tensorflow/contrib/slim/python/slim/learning.py +++ b/tensorflow/contrib/slim/python/slim/learning.py @@ -698,6 +698,18 @@ def train(train_op, tf_variables.local_variables_initializer(), data_flow_ops.initialize_all_tables()) + if sync_optimizer is not None and isinstance( + sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizerV2): + with ops.control_dependencies([local_init_op] if local_init_op is + not None else []): + if is_chief: + local_init_op = sync_optimizer.chief_init_op + else: + local_init_op = sync_optimizer.local_step_init_op + ready_for_local_init_op = sync_optimizer.ready_for_local_init_op + else: + ready_for_local_init_op = None + if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() @@ -715,9 +727,7 @@ def train(train_op, 'tf.train.SyncReplicasOptimizerV2.') # Need to create these BEFORE the supervisor finalizes the graph: - with ops.control_dependencies([init_op]): - init_tokens_op = sync_optimizer.get_init_tokens_op() - init_op = init_tokens_op + init_tokens_op = sync_optimizer.get_init_tokens_op() chief_queue_runner = sync_optimizer.get_chief_queue_runner() if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): @@ -746,6 +756,7 @@ def train(train_op, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, + ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, @@ -776,6 +787,7 @@ def train(train_op, logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) + sess.run(init_tokens_op) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py 
b/tensorflow/contrib/slim/python/slim/learning_test.py index 42949e2c286185..7225ab86c43d99 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -625,7 +625,7 @@ def testTrainWithInitFromCheckpoint(self): tf.set_random_seed(2) train_op = self.create_train_op() - model_variables = tf.all_variables() + model_variables = tf.global_variables() model_path = os.path.join(logdir1, 'model.ckpt-300') init_op = tf.global_variables_initializer() @@ -674,7 +674,7 @@ def testTrainWithInitFromFn(self): tf.set_random_seed(2) train_op = self.create_train_op() - model_variables = tf.all_variables() + model_variables = tf.global_variables() model_path = os.path.join(logdir1, 'model.ckpt-300') saver = tf.train.Saver(model_variables) def RestoreFn(sess): diff --git a/tensorflow/contrib/slim/python/slim/model_analyzer.py b/tensorflow/contrib/slim/python/slim/model_analyzer.py index e29c7b1d8ca079..74617928a7168c 100644 --- a/tensorflow/contrib/slim/python/slim/model_analyzer.py +++ b/tensorflow/contrib/slim/python/slim/model_analyzer.py @@ -84,7 +84,7 @@ def analyze_vars(variables, print_info=False): """Prints the names and shapes of the variables. Args: - variables: list of variables, for example tf.all_variables(). + variables: list of variables, for example tf.global_variables(). print_info: Optional, if true print variables and their shape. Returns: diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD index 9d5b95c6b15dca..c844fb5d1403c8 100644 --- a/tensorflow/contrib/slim/python/slim/nets/BUILD +++ b/tensorflow/contrib/slim/python/slim/nets/BUILD @@ -135,7 +135,7 @@ py_library( py_test( name = "resnet_v1_test", - size = "medium", + size = "large", srcs = ["resnet_v1_test.py"], srcs_version = "PY2AND3", deps = [ @@ -154,7 +154,7 @@ py_library( py_test( name = "resnet_v2_test", - size = "medium", + size = "large", srcs = ["resnet_v2_test.py"], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py index a25532ab415135..67e4a559a9c67c 100644 --- a/tensorflow/contrib/specs/python/specs_test.py +++ b/tensorflow/contrib/specs/python/specs_test.py @@ -118,8 +118,8 @@ def testConc(self): result = outputs.eval() self.assertEqual(tuple(result.shape), (10, 30)) self.assertEqual(summaries.tf_spec_structure(spec, inputs), - "_ _ var dot var biasadd sig " - "<> var dot var biasadd sig concat") + "_ var dot var biasadd sig " + "<> var dot var biasadd sig _ concatv2") def testImport(self): with self.test_session(): @@ -197,7 +197,7 @@ def testVar(self): initializer=tf.constant_initializer(42.0)) inputs = tf.constant(_rand(10, 100)) outputs = v.funcall(inputs) - self.assertEqual(len(tf.all_variables()), 1) + self.assertEqual(len(tf.global_variables()), 1) sess.run([outputs.initializer]) outputs_value = outputs.eval() self.assertEqual(outputs_value.shape, (2, 2)) @@ -211,7 +211,7 @@ def testShared(self): g = f | f | f | f inputs = tf.constant(_rand(10, 100)) _ = g.funcall(inputs) - self.assertEqual(len(tf.all_variables()), 2) + self.assertEqual(len(tf.global_variables()), 2) def testAutoFunction(self): with self.test_session(): diff --git a/tensorflow/contrib/specs/python/summaries.py b/tensorflow/contrib/specs/python/summaries.py index 27f3bb32d74581..a0d56cd97afc91 100644 --- a/tensorflow/contrib/specs/python/summaries.py +++ b/tensorflow/contrib/specs/python/summaries.py @@ -156,7 +156,7 @@ def 
tf_num_params(x): if isinstance(x, tf.Tensor): shape = x.get_shape() x = x.op - if x.type == "Variable": + if x.type in ["Variable", "VariableV2"]: return shape.num_elements() totals = [tf_num_params(y) for y in x.inputs] return sum(totals) diff --git a/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py b/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py index 616be81e277970..a84e11b0438526 100644 --- a/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py +++ b/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py @@ -34,7 +34,7 @@ def testStatSummarizer(self): graph_def.SerializeToString()) with self.test_session() as sess: - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) for _ in range(20): run_metadata = tf.RunMetadata() diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 38bdd976ccf9b4..504462c8c6b284 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -112,12 +112,10 @@ def fill(self): # regression and avoids having to recompute sums for classification. self.num_output_columns = self.num_classes + 1 - # The Random Forest literature recommends sqrt(# features) for - # classification problems, and p/3 for regression problems. - # TODO(thomaswc): Consider capping this for large number of features. - self.num_splits_to_consider = ( - self.num_splits_to_consider or - max(10, int(math.ceil(math.sqrt(self.num_features))))) + # Our experiments have found that num_splits_to_consider = num_features + # gives good accuracy. + self.num_splits_to_consider = self.num_splits_to_consider or min( + self.num_features, 1000) self.max_fertile_nodes = (self.max_fertile_nodes or int(math.ceil(self.max_nodes / 2.0))) @@ -371,7 +369,8 @@ def training_graph(self, if self.params.bagging_fraction < 1.0: # TODO(thomaswc): This does sampling without replacment. Consider # also allowing sampling with replacement as an option. - batch_size = array_ops.slice(array_ops.shape(input_data), [0], [1]) + batch_size = array_ops.strided_slice( + array_ops.shape(input_data), [0], [1]) r = random_ops.random_uniform(batch_size, seed=seed) mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) @@ -537,9 +536,10 @@ def _nothing(): return control_flow_ops.no_op() return control_flow_ops.cond( - math_ops.equal(array_ops.squeeze(array_ops.slice( - self.variables.tree, [0, 0], [1, 1])), -2), - _init_tree, _nothing) + math_ops.equal( + array_ops.squeeze( + array_ops.strided_slice(self.variables.tree, [0, 0], [1, 1])), + -2), _init_tree, _nothing) def _gini(self, class_counts): """Calculate the Gini impurity. @@ -633,7 +633,7 @@ def training_graph(self, if isinstance(input_data, sparse_tensor.SparseTensor): sparse_indices = input_data.indices sparse_values = input_data.values - sparse_shape = input_data.shape + sparse_shape = input_data.dense_shape input_data = [] # Count extremely random stats. 
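# (Editor's note, illustrative sketch only -- not part of the patch.) Both
# tensor_forest hunks in this file replace the deprecated SparseTensor.shape
# attribute with .dense_shape, the newer spelling. Roughly, with hypothetical
# values:
#
#   sp = tf.SparseTensor(indices=[[0, 0], [1, 2]],
#                        values=[1.0, 2.0],
#                        dense_shape=[2, 3])
#   indices, values, shape = sp.indices, sp.values, sp.dense_shape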
@@ -890,7 +890,7 @@ def inference_graph(self, input_data, data_spec): if isinstance(input_data, sparse_tensor.SparseTensor): sparse_indices = input_data.indices sparse_values = input_data.values - sparse_shape = input_data.shape + sparse_shape = input_data.dense_shape input_data = [] return self.inference_ops.tree_predictions( input_data, sparse_indices, sparse_values, sparse_shape, data_spec, diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index 75b00aa990dca3..0a0f473855d07f 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -33,14 +33,13 @@ def testForestHParams(self): split_after_samples=25, num_features=60).fill() self.assertEquals(2, hparams.num_classes) self.assertEquals(3, hparams.num_output_columns) - # sqrt(num_features) < 10, so num_splits_to_consider should be 10. - self.assertEquals(10, hparams.num_splits_to_consider) + self.assertEquals(60, hparams.num_splits_to_consider) # Don't have more fertile nodes than max # leaves, which is 500. self.assertEquals(500, hparams.max_fertile_nodes) # Default value of valid_leaf_threshold self.assertEquals(1, hparams.valid_leaf_threshold) - # split_after_samples is larger than 10 - self.assertEquals(1, hparams.split_initializations_per_input) + # floor(60 / 25) = 2 + self.assertEquals(2, hparams.split_initializations_per_input) self.assertEquals(0, hparams.base_random_seed) def testForestHParamsBigTree(self): @@ -48,12 +47,11 @@ def testForestHParamsBigTree(self): num_classes=2, num_trees=100, max_nodes=1000000, split_after_samples=25, num_features=1000).fill() - # sqrt(1000) = 31.63... - self.assertEquals(32, hparams.num_splits_to_consider) + self.assertEquals(1000, hparams.num_splits_to_consider) # 1000000 / 2 = 500000 self.assertEquals(500000, hparams.max_fertile_nodes) - # floor(31.63 / 25) = 1 - self.assertEquals(1, hparams.split_initializations_per_input) + # floor(1000 / 25) = 40 + self.assertEquals(40, hparams.split_initializations_per_input) def testTrainingConstructionClassification(self): input_data = [[-1., 0.], [-1., 2.], # node 1 diff --git a/tensorflow/contrib/tensorboard/plugins/trace/trace_info.proto b/tensorflow/contrib/tensorboard/plugins/trace/trace_info.proto index 09013c63876d82..9f20becb0f4d41 100644 --- a/tensorflow/contrib/tensorboard/plugins/trace/trace_info.proto +++ b/tensorflow/contrib/tensorboard/plugins/trace/trace_info.proto @@ -15,7 +15,7 @@ limitations under the License. 
syntax = "proto3"; -package tensorflow; +package tensorflow.contrib.tensorboard; message TraceInfo { repeated OpInfo ops = 1; diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py index 3f397d240103bd..e2e893676c53af 100644 --- a/tensorflow/contrib/training/python/training/bucket_ops.py +++ b/tensorflow/contrib/training/python/training/bucket_ops.py @@ -152,7 +152,7 @@ def bucket(tensors, with ops.name_scope(name, "bucket", tensor_list) as name: tensor_list = _validate_bucket(tensor_list) (tensor_list, sparse_info) = _store_sparse_tensors( - tensor_list, enqueue_many=False) + tensor_list, enqueue_many=False, keep_input=constant_op.constant(True)) # Round-trip batch_size to a tensor, and possibly back batch_size = ops.convert_to_tensor( diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py index 927f6ab75a3bfc..3d83aec94e7511 100644 --- a/tensorflow/contrib/training/python/training/evaluation_test.py +++ b/tensorflow/contrib/training/python/training/evaluation_test.py @@ -51,7 +51,7 @@ def testReturnsSingleCheckpointIfOneCheckpointFound(self): saver = tf.train.Saver() # Saves the global step. with self.test_session() as session: - session.run(tf.initialize_all_variables()) + session.run(tf.global_variables_initializer()) save_path = os.path.join(checkpoint_dir, 'model.ckpt') saver.save(session, save_path, global_step=global_step) @@ -81,7 +81,7 @@ def testReturnsSingleCheckpointIfOneShardedCheckpoint(self): target='', config=tf.ConfigProto(device_count={'CPU': 2})) as session: - session.run(tf.initialize_all_variables()) + session.run(tf.global_variables_initializer()) save_path = os.path.join(checkpoint_dir, 'model.ckpt') saver.save(session, save_path, global_step=global_step) diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py index 4baece2e5d99e2..3967c0fb533d5c 100644 --- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py +++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py @@ -662,7 +662,7 @@ class SequenceQueueingStateSaver(object): batch_size = 32 num_unroll = 20 lstm_size = 8 - cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) + cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) initial_state_values = tf.zeros(cell.state_size, dtype=tf.float32) raw_data = get_single_input_from_input_reader() @@ -1267,7 +1267,7 @@ def batch_sequences_with_states(input_key, input_sequences, input_context, num_unroll = 20 num_enqueue_threads = 3 lstm_size = 8 - cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) + cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) key, sequences, context = my_parser(raw_data) initial_state_values = tf.zeros((state_size,), dtype=tf.float32) diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py index e65ef6ba119681..d5347a83066373 100644 --- a/tensorflow/contrib/training/python/training/training.py +++ b/tensorflow/contrib/training/python/training/training.py @@ -14,7 +14,230 @@ # ============================================================================== """Contains various routines and helper functions for training models. -TODO(nsilberman): Port documentation. +This script contains various functions for training models. 
These include +manipulating gradients, creating a `train_op` (an operation that computes the +loss and applies the gradients) and a training loop function. The training loop +allows the user to pass in the `train_op` and runs the optimization according +to user-specified arguments. + +************************************ +* A simple working training script * +************************************ + + # Load data and create the model: + images, labels = LoadData(...) + predictions = MyModel(images) + + # Define the loss: + tf.contrib.losses.log_loss(predictions, labels) + total_loss = tf.contrib.losses.get_total_loss() + + # Define the optimizer: + optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum) + + # Create the train_op + train_op = tf.contrib.training.create_train_op(total_loss, optimizer) + + # Run training. + tf.contrib.training.train(train_op, my_log_dir) + +************************* +* Creating the train_op * +************************* + +In order to use the `train` function, one needs a train_op: an `Operation` that +(a) computes the loss, (b) applies the gradients to update the weights and +(c) returns the value of the loss. tf.contrib.training.create_train_op creates +such an `Operation`. This function also provides the ability to manipulate +the gradients using a few arguments: + + # Create the train_op and clip the gradient norms: + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer, + clip_gradient_norm=4) + + # Create the train_op and scale the gradients by providing a map from variable + # name (or variable) to a scaling coefficient: + def transform_grads_fn(grads): + gradient_multipliers = { + 'conv0/weights': 1.2, + 'fc8/weights': 3.4, + } + return tf.contrib.training.multiply_gradients( + grads, gradient_multipliers) + + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer, + transform_grads_fn=transform_grads_fn) + +**************************************************************** +* Performing additional (non-gradient) updates during training * +**************************************************************** + +Many networks utilize modules, like BatchNorm, that require performing a series +of non-gradient updates during training. tf.contrib.training.create_train_op +allows a user to pass in a list of update_ops to call along with the gradient +updates. + + train_op = tf.contrib.training.create_train_op( + total_loss, optimizer, update_ops) + +By default, tf.contrib.training.create_train_op includes all update ops that are +part of the `tf.GraphKeys.UPDATE_OPS` collection. Additionally, the +tf.contrib.layers.batch_norm function adds the moving mean and moving variance +updates to this collection. Consequently, users who want to use +tf.contrib.layers.batch_norm will not need to take any additional steps in order +to have the moving mean and moving variance updates be computed. 
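As a concrete, illustrative sketch of that default (not part of the docstring above), assuming a model built with `tf.contrib.layers` and the loss/optimizer pattern shown earlier; the placeholder shapes are made up:

  import tensorflow as tf

  images = tf.placeholder(tf.float32, [None, 28, 28, 3])
  labels = tf.placeholder(tf.float32, [None, 10])

  net = tf.contrib.layers.conv2d(images, 32, [3, 3])
  net = tf.contrib.layers.batch_norm(net, is_training=True)
  predictions = tf.contrib.layers.fully_connected(
      tf.contrib.layers.flatten(net), 10, activation_fn=tf.nn.sigmoid)

  tf.contrib.losses.log_loss(predictions, labels)
  total_loss = tf.contrib.losses.get_total_loss()
  optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)

  # No `update_ops` argument: the moving mean/variance updates that batch_norm
  # adds to tf.GraphKeys.UPDATE_OPS run together with the gradient step.
  train_op = tf.contrib.training.create_train_op(total_loss, optimizer)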
+ +However, users with additional, specialized updates can either override the +default update ops or simply add additional update ops to the +`tf.GraphKeys.UPDATE_OPS` collection: + + # Force `create_train_op` to NOT use ANY update_ops: + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer, + update_ops=[]) + + # Use an alternative set of update ops: + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer, + update_ops=my_other_update_ops) + + # Use a set of update ops in addition to the default updates: + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, my_update0) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, my_update1) + + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer) + + # Which is the same as: + train_op = tf.contrib.training.create_train_op( + total_loss, + optimizer, + update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + +****************************************** +* Initializing a model from a checkpoint * +****************************************** + +It is common to want to 'warm-start' a model from a pre-trained checkpoint. +One can use a tf.Scaffold and an initializing function to do so. + + ... + + # Create the train_op + train_op = tf.contrib.training.create_train_op(total_loss, optimizer) + + # Create the initial assignment op + checkpoint_path = '/path/to/old_model_checkpoint' + variables_to_restore = tf.contrib.framework.get_model_variables() + init_fn = tf.contrib.framework.assign_from_checkpoint_fn( + checkpoint_path, variables_to_restore) + + # Run training. + scaffold = tf.Scaffold(init_fn=init_fn) + tf.contrib.training.train(train_op, my_log_dir, scaffold=scaffold) + +*************************************************************************** +* Initializing a model from a checkpoint whose variable names don't match * +*************************************************************************** + +At times, a user may want to initialize a new model with values from a +checkpoint whose variable names do not match those of the current model. In this +case, one needs to create a mapping from the checkpoint variable names to the +current model variables. This requires only a small modification of the code +above: + ... + # Creates a model with two variables, var0 and var1 + predictions = MyModel(images) + ... + + # Create the train_op + train_op = tf.contrib.training.create_train_op(total_loss, optimizer) + + checkpoint_path = '/path/to/old_model_checkpoint' + + # Create the mapping: + variables_to_restore = { + 'name_var_0_in_checkpoint': + tf.contrib.framework.get_unique_variable('var0'), + 'name_var_1_in_checkpoint': + tf.contrib.framework.get_unique_variable('var1') + } + init_fn = tf.contrib.framework.assign_from_checkpoint_fn( + checkpoint_path, variables_to_restore) + scaffold = tf.Scaffold(init_fn=init_fn) + + # Run training. + tf.contrib.training.train(train_op, my_log_dir, scaffold=scaffold) + + +************************************************* +* Fine-Tuning Part of a model from a checkpoint * +************************************************* + +Rather than initializing all of the weights of a given model, we sometimes +only want to restore some of the weights from a checkpoint. To do this, one +need only filter those variables to initialize as follows: + + ... 
+ + # Create the train_op + train_op = tf.contrib.training.create_train_op(total_loss, optimizer) + + checkpoint_path = '/path/to/old_model_checkpoint' + + # Specify the variables to restore via a list of inclusion or exclusion + # patterns: + variables_to_restore = tf.contrib.framework.get_variables_to_restore( + include=["conv"], exclude=["fc8", "fc9"]) + # or + variables_to_restore = tf.contrib.framework.get_variables_to_restore( + exclude=["conv"]) + + init_fn = tf.contrib.framework.assign_from_checkpoint_fn( + checkpoint_path, variables_to_restore) + scaffold = tf.Scaffold(init_fn=init_fn) + + # Run training. + tf.contrib.training.train(train_op, my_log_dir, scaffold=scaffold) + +****************************************************** +* Initializing model variables from values in memory * +****************************************************** + +One may want to initialize the weights of a model from values coming from an +arbitrary source (a text document, matlab file, etc). While this is technically +feasible using assign operations, this strategy results in the values of your +weights being stored in the graph. For large models, this becomes prohibitively +large. However, it's possible to perform this initial assignment without having +to store the values of the initial model in the graph itself by using +placeholders and a feed dictionary: + + ... + + # Create the train_op + train_op = tf.contrib.training.create_train_op(total_loss, optimizer) + + # Create the mapping from variable names to values: + var0_initial_value = ReadFromDisk(...) + var1_initial_value = ReadFromDisk(...) + + var_names_to_values = { + 'var0': var0_initial_value, + 'var1': var1_initial_value, + } + + init_fn = tf.contrib.framework.assign_from_values_fn(var_names_to_values) + scaffold = tf.Scaffold(init_fn=init_fn) + + # Run training. + tf.contrib.training.train(train_op, my_log_dir, scaffold=scaffold) """ from __future__ import absolute_import @@ -149,7 +372,7 @@ def create_train_op(total_loss, total_loss: A `Tensor` representing the total loss. optimizer: A tf.Optimizer to use for computing the gradients. global_step: A `Tensor` representing the global step variable. If left as - `None`, then slim.variables.global_step() is used. + `None`, then tf.contrib.framework.global_step() is used. update_ops: An optional list of updates to execute. If `update_ops` is `None`, then the update ops are set to the contents of the `tf.GraphKeys.UPDATE_OPS` collection.
If `update_ops` is not `None`, but diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py index 918c1da018d11c..c0e79aa798757d 100644 --- a/tensorflow/contrib/training/python/training/training_test.py +++ b/tensorflow/contrib/training/python/training/training_test.py @@ -310,7 +310,7 @@ def testTrainWithInitFromCheckpoint(self): tf.set_random_seed(2) train_op = self.create_train_op() - model_variables = tf.all_variables() + model_variables = tf.global_variables() model_path = os.path.join(logdir1, 'model.ckpt-300') assign_fn = tf.contrib.framework.assign_from_checkpoint_fn( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 3f9b94128a9d35..9a882ea3356f41 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -73,7 +73,6 @@ load( "tf_generate_proto_text_sources", "tf_genrule_cmd_append_to_srcs", "tf_opts_nortti_if_android", - "tf_proto_text_protos_relative", "cc_header_only_library", ) load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") @@ -93,6 +92,9 @@ load( "tf_additional_lib_deps", "tf_additional_stream_executor_srcs", "tf_additional_cupti_wrapper_deps", + "tf_additional_libdevice_data", + "tf_additional_libdevice_deps", + "tf_additional_libdevice_srcs", "tf_additional_test_deps", "tf_additional_test_srcs", "tf_kernel_tests_linkstatic", @@ -105,18 +107,61 @@ load( # ----------------------------------------------------------------------------- # Public targets +# Protos which are needed for core tensorflow, including on mobile builds. +# +# Note that some protos are in neither additional_core_proto_srcs nor this +# filegroup; e.g. ones with individual proto_library targets. +CORE_PROTO_SRCS = [ + "example/example.proto", + "example/feature.proto", + "framework/allocation_description.proto", + "framework/attr_value.proto", + "framework/cost_graph.proto", + "framework/device_attributes.proto", + "framework/function.proto", + "framework/graph.proto", + "framework/kernel_def.proto", + "framework/log_memory.proto", + "framework/node_def.proto", + "framework/op_def.proto", + "framework/resource_handle.proto", + "framework/step_stats.proto", + "framework/summary.proto", + "framework/tensor.proto", + "framework/tensor_description.proto", + "framework/tensor_shape.proto", + "framework/tensor_slice.proto", + "framework/types.proto", + "framework/versions.proto", + "lib/core/error_codes.proto", + "protobuf/config.proto", + "protobuf/tensor_bundle.proto", + "protobuf/saver.proto", + "util/memmapped_file_system.proto", + "util/saved_tensor_slice.proto", +] + +# Protos which are not needed on mobile builds, but should be included in +# protos_all. +# +# Note that some protos are in neither core_proto_srcs nor this filegroup; e.g. +# ones with individual proto_library targets. 
+ADDITIONAL_CORE_PROTO_SRCS = [ + "example/example_parser_configuration.proto", + "framework/variable.proto", + "protobuf/control_flow.proto", + "protobuf/meta_graph.proto", + "protobuf/named_tensor.proto", + "protobuf/queue_runner.proto", + "protobuf/saved_model.proto", + "protobuf/tensorflow_server.proto", + "util/event.proto", + "util/test_log.proto", +] + tf_proto_library( name = "protos_all", - srcs = glob( - ["**/*.proto"], - exclude = [ - "debug/debug_service.proto", - "protobuf/worker.proto", - "protobuf/worker_service.proto", - "protobuf/master.proto", - "protobuf/master_service.proto", - ], - ), + srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, cc_api_version = 2, go_api_version = 2, java_api_version = 2, @@ -183,12 +228,7 @@ cc_library( "lib/io/table_builder.h", "lib/io/table_options.h", "lib/math/math_util.h", - "lib/monitoring/collected_metrics.h", - "lib/monitoring/collection_registry.h", "lib/monitoring/counter.h", - "lib/monitoring/metric_def.h", - "lib/monitoring/mobile_counter.h", - "lib/monitoring/mobile_sampler.h", "lib/monitoring/sampler.h", "lib/random/distribution_sampler.h", "lib/random/philox_random.h", @@ -377,6 +417,7 @@ tf_gen_op_libs( "random_ops", "resource_variable_ops", "sdca_ops", + "set_ops", "script_ops", "sendrecv_ops", "sparse_ops", @@ -420,6 +461,7 @@ cc_library( ":script_ops_op_lib", ":sdca_ops_op_lib", ":sendrecv_ops_op_lib", + ":set_ops_op_lib", ":sparse_ops_op_lib", ":state_ops_op_lib", ":string_ops_op_lib", @@ -542,6 +584,7 @@ cc_library( "//tensorflow/core/kernels:required", "//tensorflow/core/kernels:resource_variable_ops", "//tensorflow/core/kernels:sdca_ops", + "//tensorflow/core/kernels:set_kernels", "//tensorflow/core/kernels:sparse", "//tensorflow/core/kernels:state", "//tensorflow/core/kernels:string", @@ -646,7 +689,7 @@ load( # List of protos we want on android filegroup( name = "android_proto_srcs", - srcs = tf_android_core_proto_sources(), + srcs = tf_android_core_proto_sources(CORE_PROTO_SRCS), visibility = ["//visibility:public"], ) @@ -988,6 +1031,7 @@ cc_library( "platform/gif.h", "platform/jpeg.h", "platform/**/cuda.h", + "platform/**/cuda_libdevice_path.cc", "platform/**/stream_executor.h", "platform/load_library.cc", ], @@ -1008,6 +1052,7 @@ cc_library( "platform/gif.h", "platform/jpeg.h", "platform/**/cuda.h", + "platform/**/cuda_libdevice_path.cc", "platform/**/stream_executor.h", ], ), @@ -1015,6 +1060,7 @@ cc_library( exclude = [ "**/*test*", "platform/**/cuda.h", + "platform/**/cuda_libdevice_path.cc", "platform/**/stream_executor.h", ] + # Protobuf deps already included through the ":lib_proto_parsing" @@ -1039,6 +1085,11 @@ cc_library( "lib/io/zlib_compression_options.h", "lib/io/zlib_inputstream.h", "lib/io/zlib_outputbuffer.h", + "lib/monitoring/collected_metrics.h", + "lib/monitoring/collection_registry.h", + "lib/monitoring/metric_def.h", + "lib/monitoring/mobile_counter.h", + "lib/monitoring/mobile_sampler.h", "lib/png/png_io.h", "lib/random/random.h", "lib/random/random_distributions.h", @@ -1051,6 +1102,7 @@ cc_library( "platform/demangle.h", "platform/denormal.h", "platform/host_info.h", + "platform/monitoring.h", "platform/platform.h", "platform/protobuf_internal.h", "platform/tensor_coding.h", @@ -1103,7 +1155,7 @@ cc_library( proto_text_hdrs_and_srcs = tf_generate_proto_text_sources( name = "proto_text_srcs_all", - srcs = tf_proto_text_protos_relative(), + srcs = CORE_PROTO_SRCS, srcs_relative_dir = "tensorflow/core/", ) @@ -1750,6 +1802,20 @@ tf_cc_tests_gpu( ], ) +tf_cc_test_gpu( + name = 
"cuda_libdevice_path_test", + size = "small", + srcs = ["platform/cuda_libdevice_path_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags(), + deps = [ + ":cuda_libdevice_path", + ":lib", + ":test", + ":test_main", + ], +) + tf_cc_test_gpu( name = "memory_types_test", size = "small", @@ -2131,6 +2197,7 @@ tf_cc_tests( "ops/nn_ops_test.cc", "ops/parsing_ops_test.cc", "ops/random_ops_test.cc", + "ops/set_ops_test.cc", "ops/sparse_ops_test.cc", "ops/state_ops_test.cc", "ops/string_ops_test.cc", @@ -2242,6 +2309,18 @@ filegroup( ], ) +cc_library( + name = "cuda_libdevice_path", + srcs = ["platform/cuda_libdevice_path.cc"] + tf_additional_libdevice_srcs(), + hdrs = ["platform/cuda_libdevice_path.h"], + copts = tf_copts(), + data = tf_additional_libdevice_data(), + visibility = ["//visibility:public"], + deps = [ + ":lib", + ] + tf_additional_libdevice_deps(), +) + # ----------------------------------------------------------------------------- # Google-internal targets go here (must be at the end). diff --git a/tensorflow/core/common_runtime/device_set.cc b/tensorflow/core/common_runtime/device_set.cc index 8ff93760d4974b..84f435b0229662 100644 --- a/tensorflow/core/common_runtime/device_set.cc +++ b/tensorflow/core/common_runtime/device_set.cc @@ -54,10 +54,15 @@ Device* DeviceSet::FindDeviceByName(const string& name) const { int DeviceSet::DeviceTypeOrder(const DeviceType& d) { if (StringPiece(d.type()) == DEVICE_CPU) { return 3; - } else if (StringPiece(d.type()) == DEVICE_GPU) { + } else if (StringPiece(d.type()) == DEVICE_GPU || + StringPiece(d.type()) == DEVICE_SYCL) { return 2; } else { - return 1; + // Non-CPU/GPU devices are never prioritized over CPU and GPU, and + // must be explicitly selected. This is to prevent surprising + // placements that cause a lot of cross-device communication + // between the host CPU device and other devices. 
+ return 10; } } diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc index 8ca744c004deed..7c59b43489f761 100644 --- a/tensorflow/core/common_runtime/device_set_test.cc +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -69,18 +69,22 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { types()); AddDevice("SYCL", "/job:a/replica:0/task:0/device:sycl:0"); - EXPECT_EQ( - (std::vector{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU), - DeviceType(DEVICE_CPU)}), - types()); + EXPECT_TRUE((types()[0] == DeviceType(DEVICE_SYCL) || + types()[0] == DeviceType(DEVICE_GPU))); + EXPECT_TRUE((types()[1] == DeviceType(DEVICE_SYCL) || + types()[1] == DeviceType(DEVICE_GPU))); + EXPECT_TRUE(types()[2] == DeviceType(DEVICE_CPU)); AddDevice("T1", "/job:a/replica:0/task:0/device:T1:0"); AddDevice("T1", "/job:a/replica:0/task:0/device:T1:1"); AddDevice("T2", "/job:a/replica:0/task:0/device:T2:0"); - EXPECT_EQ((std::vector{DeviceType(DEVICE_SYCL), DeviceType("T1"), - DeviceType("T2"), DeviceType(DEVICE_GPU), - DeviceType(DEVICE_CPU)}), - types()); + EXPECT_TRUE((types()[0] == DeviceType(DEVICE_SYCL) || + types()[0] == DeviceType(DEVICE_GPU))); + EXPECT_TRUE((types()[1] == DeviceType(DEVICE_SYCL) || + types()[1] == DeviceType(DEVICE_GPU))); + EXPECT_TRUE(types()[2] == DeviceType(DEVICE_CPU)); + EXPECT_TRUE(types()[3] == DeviceType("T1")); + EXPECT_TRUE(types()[4] == DeviceType("T2")); } } // namespace diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h index 8cca22fb6fdb32..239c9666e33911 100644 --- a/tensorflow/core/common_runtime/executor.h +++ b/tensorflow/core/common_runtime/executor.h @@ -39,7 +39,7 @@ class StepStatsCollector; // Rendezvous* rendezvous = NewNaiveRendezvous(); // TF_CHECK_OK(rendezvous->Send("input", some_input_tensor)); // TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr})); -// TF_CHECK_OK(rendezvous->Recv("input", &output_tensor)); +// TF_CHECK_OK(rendezvous->Recv("output", &output_tensor)); // ... ... // // Multiple threads can call Executor::Run concurrently. diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 695c7244aee450..11e0b3a0421bc0 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -672,6 +672,16 @@ const Edge* GetTheOnlyDataEdge(const EdgeSet& edges) { // a ref. return nullptr; } + if (IsRecv(e->src()) || IsSwitch(e->src())) { + // Don't touch it if the identity is introduced for control flow. + // Recv disables all its successors if it receives a dead signal. + // When Recv has an outgoing control edge, the current executor + // would not disable the destination. The current solution (see + // graph_partition.cc) is to add an identity after Recv and change + // the control edge to be from this identity node. So the identity + // can't be removed. 
+ return nullptr; + } ret = e; } return ret; diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index 523547b4eb4d1f..03879f9ce63c4d 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -229,6 +229,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { TEST_F(FunctionLibraryRuntimeTest, IsStateful) { Init({}); EXPECT_TRUE(lib_->IsStateful("Variable")); + EXPECT_TRUE(lib_->IsStateful("VariableV2")); EXPECT_FALSE(lib_->IsStateful("Matmul")); } @@ -389,7 +390,7 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsOld) { } // Like the above test, but using NodeDefs in the FunctionDef. -TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) { +TEST_F(FunctionLibraryRuntimeTest, DISABLED_ManySwapsNodeDef) { auto func = FDH::Create( // Creates a FunctionDef using NodeDefs // Name "ManySwapsNodeDef", @@ -764,14 +765,14 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) { {}, // Nodes {// variable - {{"v"}, "Variable", {}, {{"dtype", T}, {"shape", TensorShape({})}}}, + {{"v"}, "VariableV2", {}, {{"dtype", T}, {"shape", TensorShape({})}}}, // read the variable. Shouldn't be removed. {{"v_read"}, "Identity", {"v"}, {{"T", T}}}, // returns v + v {{"ret"}, "Add", {"v_read", "v_read"}, {{"T", T}}}}); const char* e0 = R"S( () -> (n2:float) { - n0 = Variable[container="", dtype=float, shape=[], shared_name=""]() + n0 = VariableV2[container="", dtype=float, shape=[], shared_name=""]() n1 = Identity[T=float](n0) n2 = Add[T=float](n1, n1) } @@ -780,7 +781,7 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) { const char* e1 = R"S( () -> (n2:float) { - n0 = Variable[container="", dtype=float, shape=[], shared_name=""]() + n0 = VariableV2[container="", dtype=float, shape=[], shared_name=""]() n1 = Identity[T=float](n0) n2 = Add[T=float](n1, n1) } diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 269192373d8fa2..d40e3e4caa84ea 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -583,9 +583,9 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, // may run into trouble later with data transfer operations. The // trouble may manifest as slower than expected performance, or // outright failures. - LOG(ERROR) << "Could not identify NUMA node of " << name - << ", defaulting to 0. Your kernel may not have been built " - "with NUMA support."; + LOG(INFO) << "Could not identify NUMA node of " << name + << ", defaulting to 0. Your kernel may not have been built " + << "with NUMA support."; numa_node = 0; } diff --git a/tensorflow/core/common_runtime/session.cc b/tensorflow/core/common_runtime/session.cc index 55648bc97c9dd5..4a26a2187f5c92 100644 --- a/tensorflow/core/common_runtime/session.cc +++ b/tensorflow/core/common_runtime/session.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/session_factory.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/monitoring.h" #include "tensorflow/core/public/session.h" namespace tensorflow { @@ -52,16 +54,22 @@ Status Session::PRun(const string& handle, } Session* NewSession(const SessionOptions& options) { - SessionFactory* factory; - Status s = SessionFactory::GetFactory(options, &factory); + Session* out_session; + const Status s = NewSession(options, &out_session); if (!s.ok()) { LOG(ERROR) << s; return nullptr; } - return factory->NewSession(options); + return out_session; } Status NewSession(const SessionOptions& options, Session** out_session) { + // Starts the monitoring exporter the first time this method is called. + static bool started TF_ATTRIBUTE_UNUSED = []() { + monitoring::StartExporter(); + return true; + }(); + SessionFactory* factory; Status s = SessionFactory::GetFactory(options, &factory); if (!s.ok()) { diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f5d661d199e7b4..686bc6885e06c9 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -59,13 +59,15 @@ Allocator* ThreadPoolDevice::GetAllocator(AllocatorAttributes attr) { Status ThreadPoolDevice::MakeTensorFromProto( const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { - Tensor parsed(tensor_proto.dtype()); - if (!parsed.FromProto(cpu_allocator(), tensor_proto)) { - return errors::InvalidArgument("Cannot parse tensor from proto: ", - ProtoDebugString(tensor_proto)); + if (tensor_proto.dtype() > 0 && tensor_proto.dtype() <= DataType_MAX) { + Tensor parsed(tensor_proto.dtype()); + if (parsed.FromProto(cpu_allocator(), tensor_proto)) { + *tensor = parsed; + return Status::OK(); + } } - *tensor = parsed; - return Status::OK(); + return errors::InvalidArgument("Cannot parse tensor from proto: ", + ProtoDebugString(tensor_proto)); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index ec8c06abb492ff..3bee20623b6f6b 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -343,8 +343,16 @@ class GrpcWorkerService : public AsyncServiceInterface { { mutex_lock l(mu_); token = cancellation_manager_->get_cancellation_token(); - cancellation_manager_->RegisterCallback(token, - [cm]() { cm->StartCancel(); }); + bool already_cancelled = !cancellation_manager_->RegisterCallback( + token, [cm]() { cm->StartCancel(); }); + if (already_cancelled) { + call->ClearCancelCallback(); + delete cm; + delete collector; + delete out; + call->SendResponse(ToGrpcStatus(errors::Aborted("Call was aborted"))); + return; + } } CostGraphDef* cost_graph = call->response.mutable_cost_graph(); env_->graph_mgr->ExecuteAsync( diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 8874e99078aefb..134bd4fadbf6d7 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -169,7 +169,7 @@ typedef std::unordered_map NameInfoIndex; Status AddArgName(NameInfoIndex* name_info, const string& arg, const NameInfoItem& item) { if (!name_info->insert({arg, item}).second) { - return 
errors::InvalidArgument("Duplicated arg name."); + return errors::InvalidArgument("Duplicated arg name: ", arg); } return Status::OK(); } @@ -206,7 +206,7 @@ Status BuildInputArgIndex(const OpDef::ArgDef& arg_def, Status AddRetName(NameInfoIndex* name_info, const string& ret, const NameInfoItem& item) { if (!name_info->insert({ret, item}).second) { - return errors::InvalidArgument("Duplicated ret name."); + return errors::InvalidArgument("Duplicated ret name: ", ret); } return Status::OK(); } @@ -741,6 +741,8 @@ Status InstantiateFunction(const FunctionDef& fdef, const InstantiateAttrValueMap& attr_values, GetFunctionSignature get_function, InstantiationResult* result) { + VLOG(3) << "Instantiation Function: " << Print(fdef); + const OpDef& sig = fdef.signature(); GraphDef* gdef = &result->gdef; gdef->Clear(); @@ -770,7 +772,8 @@ Status InstantiateFunction(const FunctionDef& fdef, // Makes a copy of all attrs in fdef and substitutes placeholders. // After this step, every attr is bound to a concrete value. std::vector node_attrs; - if (fdef.node_def_size() > 0) { + if (false && fdef.node_def_size() > 0) { + // TODO(josh11b): enable this branch. node_attrs.resize(fdef.node_def_size()); for (int i = 0; i < fdef.node_def_size(); ++i) { for (auto attr : fdef.node_def(i).attr()) { diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc index eb5aa9a534352b..e9e7bbf5b8ed28 100644 --- a/tensorflow/core/framework/function_test.cc +++ b/tensorflow/core/framework/function_test.cc @@ -91,7 +91,7 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, SquarePlusOneNodeDef) { +TEST(TFunc, DISABLED_SquarePlusOneNodeDef) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "SquarePlusOne", @@ -137,7 +137,7 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, ControlDepNodeDef) { +TEST(TFunc, DISABLED_ControlDepNodeDef) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "ControlDep", @@ -224,7 +224,7 @@ BackCompat() -> (y:float) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, MissingTypeAttrNodeDef) { +TEST(TFunc, DISABLED_MissingTypeAttrNodeDef) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "BackCompat", @@ -262,7 +262,7 @@ BackCompat() -> (y:float) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, NTimesTNodeDef) { +TEST(TFunc, DISABLED_NTimesTNodeDef) { // Note that the equivalent FunctionDef using FunctionDef::Node requires // using a _ListToArray to package up the two inputs to AddN as a single // N*T edge. @@ -777,7 +777,7 @@ TEST(InstantiateErrors, TypeList_Missing_Arg) { "arg[1] is not found"); } -TEST(InstantiateErrors, NodeDef_TooManyInputs) { +TEST(InstantiateErrors, DISABLED_NodeDef_TooManyInputs) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "TooManyInputs", @@ -798,7 +798,7 @@ TEST(InstantiateErrors, NodeDef_TooManyInputs) { "Expected input[2] == 'x' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooFewInputs) { +TEST(InstantiateErrors, DISABLED_NodeDef_TooFewInputs) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. 
// Name "TooFewInputs", @@ -819,7 +819,7 @@ TEST(InstantiateErrors, NodeDef_TooFewInputs) { "Attempt to access beyond input size: 2 >= 2"); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { +TEST(InstantiateErrors, DISABLED_NodeDef_TooManyInputsFromArray1) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "TooManyInputsFromArray", @@ -847,7 +847,7 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { "Expected input[1] == 'y' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { +TEST(InstantiateErrors, DISABLED_NodeDef_TooManyInputsFromArray2) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "TooManyInputsFromArray", @@ -875,7 +875,7 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { "Input a:output too long for inputs"); } -TEST(InstantiateErrors, NodeDef_TypeMismatch) { +TEST(InstantiateErrors, DISABLED_NodeDef_TypeMismatch) { auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. // Name "TypeMismatch", diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h index b0b661be0ec676..1381dd66a56c7e 100644 --- a/tensorflow/core/framework/lookup_interface.h +++ b/tensorflow/core/framework/lookup_interface.h @@ -114,7 +114,9 @@ class LookupInterface : public ResourceBase { // - the default_value tensor shape matches the table's value shape. Status CheckFindArguments(const Tensor& keys, const Tensor& default_value); - string DebugString() override { return "A lookup table"; } + string DebugString() override { + return strings::StrCat("A lookup table of size: ", size()); + } // Returns an InitializableLookupTable, a subclass of LookupInterface, if the // current object is an InitializableLookupTable. Otherwise, returns nullptr. 
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h index 9c592e63e8b81f..baddf0bbfa21b8 100644 --- a/tensorflow/core/framework/queue_interface.h +++ b/tensorflow/core/framework/queue_interface.h @@ -86,7 +86,9 @@ class QueueInterface : public ResourceBase { virtual const DataTypeVector& component_dtypes() const = 0; - string DebugString() override { return "A queue"; } + string DebugString() override { + return strings::StrCat("A Queue of size: ", size()); + } protected: virtual ~QueueInterface() {} diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index 965ca437860614..078fdcb4e77b5e 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -399,16 +399,19 @@ TensorBuffer* FromProtoField(Allocator* a, const TensorProto& in, int64 n) { } const int64 in_n = ProtoHelper::NumElements(in); - auto begin = ProtoHelper::Begin(in); - if (n <= in_n) { - std::copy_n(begin, n, data); - } else if (in_n > 0) { - std::copy_n(begin, in_n, data); - const T& last = *(data + in_n - 1); - std::fill_n(data + in_n, n - in_n, last); - } else { + if (in_n <= 0) { std::fill_n(data, n, T()); + } else { + auto begin = ProtoHelper::Begin(in); + if (n <= in_n) { + std::copy_n(begin, n, data); + } else { + std::copy_n(begin, in_n, data); + const T& last = *(data + in_n - 1); + std::fill_n(data + in_n, n - in_n, last); + } } + return buf; } @@ -532,35 +535,39 @@ void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype, STMTS; \ break; \ } -#define CASES(TYPE_ENUM, STMTS) \ - switch (TYPE_ENUM) { \ - CASE(float, SINGLE_ARG(STMTS)) \ - CASE(double, SINGLE_ARG(STMTS)) \ - CASE(int32, SINGLE_ARG(STMTS)) \ - CASE(uint8, SINGLE_ARG(STMTS)) \ - CASE(uint16, SINGLE_ARG(STMTS)) \ - CASE(int16, SINGLE_ARG(STMTS)) \ - CASE(int8, SINGLE_ARG(STMTS)) \ - CASE(string, SINGLE_ARG(STMTS)) \ - CASE(complex64, SINGLE_ARG(STMTS)) \ - CASE(complex128, SINGLE_ARG(STMTS)) \ - CASE(int64, SINGLE_ARG(STMTS)) \ - CASE(bool, SINGLE_ARG(STMTS)) \ - CASE(qint32, SINGLE_ARG(STMTS)) \ - CASE(quint8, SINGLE_ARG(STMTS)) \ - CASE(qint8, SINGLE_ARG(STMTS)) \ - CASE(quint16, SINGLE_ARG(STMTS)) \ - CASE(qint16, SINGLE_ARG(STMTS)) \ - CASE(bfloat16, SINGLE_ARG(STMTS)) \ - CASE(Eigen::half, SINGLE_ARG(STMTS)) \ - CASE(ResourceHandle, SINGLE_ARG(STMTS)) \ - case DT_INVALID: \ - LOG(FATAL) << "Type not set"; \ - break; \ - default: \ - LOG(FATAL) << "Unexpected type: " << TYPE_ENUM; \ - break; \ - } +#define CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, INVALID, DEFAULT) \ + switch (TYPE_ENUM) { \ + CASE(float, SINGLE_ARG(STMTS)) \ + CASE(double, SINGLE_ARG(STMTS)) \ + CASE(int32, SINGLE_ARG(STMTS)) \ + CASE(uint8, SINGLE_ARG(STMTS)) \ + CASE(uint16, SINGLE_ARG(STMTS)) \ + CASE(int16, SINGLE_ARG(STMTS)) \ + CASE(int8, SINGLE_ARG(STMTS)) \ + CASE(string, SINGLE_ARG(STMTS)) \ + CASE(complex64, SINGLE_ARG(STMTS)) \ + CASE(complex128, SINGLE_ARG(STMTS)) \ + CASE(int64, SINGLE_ARG(STMTS)) \ + CASE(bool, SINGLE_ARG(STMTS)) \ + CASE(qint32, SINGLE_ARG(STMTS)) \ + CASE(quint8, SINGLE_ARG(STMTS)) \ + CASE(qint8, SINGLE_ARG(STMTS)) \ + CASE(quint16, SINGLE_ARG(STMTS)) \ + CASE(qint16, SINGLE_ARG(STMTS)) \ + CASE(bfloat16, SINGLE_ARG(STMTS)) \ + CASE(Eigen::half, SINGLE_ARG(STMTS)) \ + CASE(ResourceHandle, SINGLE_ARG(STMTS)) \ + case DT_INVALID: \ + INVALID; \ + break; \ + default: \ + DEFAULT; \ + break; \ + } + +#define CASES(TYPE_ENUM, STMTS) \ + CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \ + , LOG(FATAL) << 
"Unexpected type: " << TYPE_ENUM;) Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape) : shape_(shape), buf_(nullptr) { @@ -665,13 +672,16 @@ bool Tensor::FromProto(Allocator* a, const TensorProto& proto) { TensorShape shape(proto.tensor_shape()); const int64 N = shape.num_elements(); if (N > 0 && proto.dtype()) { + bool dtype_error = false; if (!proto.tensor_content().empty()) { const auto& content = proto.tensor_content(); - CASES(proto.dtype(), p = Helper::Decode(a, content, N)); + CASES_WITH_DEFAULT(proto.dtype(), p = Helper::Decode(a, content, N), + dtype_error = true, dtype_error = true); } else { - CASES(proto.dtype(), p = FromProtoField(a, proto, N)); + CASES_WITH_DEFAULT(proto.dtype(), p = FromProtoField(a, proto, N), + dtype_error = true, dtype_error = true); } - if (p == nullptr) return false; + if (dtype_error || p == nullptr) return false; } shape_ = shape; set_dtype(proto.dtype()); diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 648f4396074eff..3e75d4435bac69 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -93,6 +93,7 @@ void Node::Initialize(int id, int cost_id, Properties* props) { SET_CLASS(NC_HOST_RECV, ts, "_HostRecv", ""); SET_CLASS(NC_CONSTANT, ts, "Const", "HostConst"); SET_CLASS(NC_VARIABLE, ts, "Variable", ""); + SET_CLASS(NC_VARIABLE, ts, "VariableV2", ""); SET_CLASS(NC_IDENTITY, ts, "Identity", "RefIdentity"); SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandle", ""); SET_CLASS(NC_GET_SESSION_TENSOR, ts, "GetSessionTensor", ""); diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index a35f3ff15cba9d..8e9eceb6992b28 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -74,28 +74,6 @@ struct RecvInfo { typedef std::unordered_map DupRecvTable; -struct DupControlKey { - int dst_node_id; // Edge's dst node id - GraphDef* src_graph; // Edge's src node is in this subgraph -}; - -struct DupControlKeyHash { - size_t operator()(const DupControlKey& k) const { - return Hash64(reinterpret_cast(&k.src_graph), - sizeof(k.src_graph), k.dst_node_id); - } -}; - -struct DupControlKeyEq { - bool operator()(const DupControlKey& x, const DupControlKey& y) const { - return (x.dst_node_id == y.dst_node_id) && (x.src_graph == y.src_graph); - } -}; - -typedef std::unordered_map - DupControlTable; - struct PairIntHash { public: std::size_t operator()(const std::pair& x) const { @@ -847,7 +825,6 @@ Status Partition(const PartitionOptions& opts, Graph* g, string dstp; std::vector inputs; DupRecvTable dup_recv(3); - DupControlTable dup_control(3); // For a node dst, 'ref_recvs' remembers the recvs introduced by a ref // edge to dst. 'ref_control_inputs' remembers the inputs by a non-ref // edge to dst. We will add a control edge for every pair in @@ -941,9 +918,7 @@ Status Partition(const PartitionOptions& opts, Graph* g, } // Check whether there is already a send/recv pair transferring - // the same tensor/control from src to the dst partition. This - // handles the dedup case when a single source in one partition - // going to multiple destinations in another partition. + // the same tensor/control from the src to dst partition. 
const bool on_host = IsDstInputOnHost(edge, g_info); DupRecvKey key{src->id(), edge->src_output(), dst_graph, on_host}; auto iter = dup_recv.find(key); @@ -968,16 +943,6 @@ Status Partition(const PartitionOptions& opts, Graph* g, NodeDefBuilder::NodeOut send_from; if (edge->IsControlEdge()) { - // This handles the dedup case when multiple control edges going from - // one partition to a single destination in another partition. - DupControlKey key{dst->id(), src_graph}; - auto iter = dup_control.find(key); - if (iter != dup_control.end()) { - // This could cause start_time(src) > start_time(iter->second). - AddInput(iter->second, src->name(), Graph::kControlSlot); - continue; - } - // Insert a dummy const node that will generate a tiny // data element to be sent from send to recv. VLOG(1) << "Send/Recv control: " << src->assigned_device_name() << "[" @@ -991,7 +956,6 @@ Status Partition(const PartitionOptions& opts, Graph* g, } AddInput(dummy, src->name(), Graph::kControlSlot); send_from.Reset(dummy->name(), 0, DT_FLOAT); - dup_control[key] = dummy; } else { send_from.Reset(src->name(), edge->src_output(), EdgeType(edge)); } diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc index fd259f0b407895..d8322e60778dc1 100644 --- a/tensorflow/core/graph/graph_partition_test.cc +++ b/tensorflow/core/graph/graph_partition_test.cc @@ -398,37 +398,5 @@ TEST_F(GraphPartitionTest, PartitionIncompleteGraph) { EXPECT_EQ(error::INVALID_ARGUMENT, status.code()) << status; } -TEST_F(GraphPartitionTest, CrossDevice_MultiControl) { - using namespace ::tensorflow::ops; // NOLINT(build/namespaces) - auto a1 = Input(in_.WithOpName("A1")); - auto a2 = Input(in_.WithOpName("A2")); - auto b1 = Input(in_.WithOpName("B1")); - Combine( - in_.WithOpName("B2").WithControlDependencies(a1).WithControlDependencies( - a2), - b1, b1); - - Partition(ToGraphDef(), &partitions_); - EXPECT_EQ(2, partitions_.size()); - - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; - a1 = Input(scope_a_.WithOpName("A1")); - a2 = Input(scope_a_.WithOpName("A2")); - auto c = Const(scope_a_.WithOpName("A1/_0") - .WithControlDependencies(a1) - .WithControlDependencies(a2), - {}); - _Send(scope_a_.WithOpName("A1/_1"), c, "edge_3_A1", a, 82, b); - ExpectMatchA(); - - auto recv = - _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_3_A1", a, 82, b); - auto id = Identity(scope_b_.WithOpName("A1/_3"), recv); - b1 = Input(scope_b_.WithOpName("B1")); - Combine(scope_b_.WithOpName("B2").WithControlDependencies(id), b1, b1); - ExpectMatchB(); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc index 75582927e67ad0..21756b56f51200 100644 --- a/tensorflow/core/graph/quantize_training.cc +++ b/tensorflow/core/graph/quantize_training.cc @@ -75,7 +75,7 @@ inline bool IsGradientNode(const Graph* graph, const Node* node) { bool FindType(const Graph* graph, const Node* node, bool* signed_input, bool* range_given, float* input_min, float* input_max) { const string& src_op = node->type_string(); - if (src_op == "Const" || src_op == "Variable") { + if (src_op == "Const" || src_op == "Variable" || src_op == "VariableV2") { *signed_input = true; *range_given = false; } else if (src_op == "Relu") { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 101727191a037a..df775fcf82f5c0 100644 --- a/tensorflow/core/kernels/BUILD +++ 
b/tensorflow/core/kernels/BUILD @@ -41,6 +41,16 @@ load( "tf_kernel_tests_linkstatic", ) +config_setting( + # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm + # for convolutions (and possibly more in the future). You will also need + # appropriate -mavx*, as required by specific op you use. + name = "xsmm", + values = { + "define": "tensorflow_xsmm=1", + }, +) + # Public support libraries ---------------------------------------------------- cc_library( @@ -421,6 +431,17 @@ tf_kernel_library( deps = ARRAY_DEPS, ) +tf_kernel_library( + name = "set_kernels", + prefix = "set_kernels", + deps = [ + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:lib", + "//tensorflow/core:set_ops_op_lib", + "//third_party/eigen3", + ], +) + tf_kernel_library( name = "debug_ops", prefix = "debug_ops", @@ -717,6 +738,17 @@ tf_cc_test( ], ) +tf_cc_test( + name = "xsmm_conv2d_test", + size = "small", + srcs = ["xsmm_conv2d_test.cc"], + deps = [ + ":conv_ops", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "conv_ops_test", size = "small", @@ -2238,13 +2270,23 @@ tf_kernel_library( "conv_grad_ops.cc", "conv_grad_ops_3d.cc", "deep_conv2d.cc", - ], + ] + select({ + ":xsmm": ["xsmm_conv2d.cc"], + "//conditions:default": [], + }), hdrs = [ "conv_grad_ops.h", "deep_conv2d.h", "gemm_functors.h", "winograd_transform.h", - ], + ] + select({ + ":xsmm": ["xsmm_conv2d.h"], + "//conditions:default": [], + }), + defines = select({ + ":xsmm": ["TENSORFLOW_USE_LIBXSMM"], + "//conditions:default": [], + }), prefix = "conv_ops", deps = [ ":bounds_check", @@ -2255,8 +2297,15 @@ tf_kernel_library( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:nn_ops_op_lib", - ], + ] + select({ + ":xsmm": [ + "@libxsmm_archive//:libxsmm_headers", + "@libxsmm_archive//:xsmm_avx", + ], + "//conditions:default": [], + }), ) tf_kernel_library( @@ -3438,7 +3487,13 @@ filegroup( "fused_batch_norm_op.cc", "winograd_transform.h", ":android_extended_ops_headers", - ], + ] + select({ + ":xsmm": [ + "xsmm_conv2d.h", + "xsmm_conv2d.cc", + ], + "//conditions:default": [], + }), ) filegroup( diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index ab22a032126fde..b01263f288bdaa 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -474,14 +474,22 @@ class AbortOp : public OpKernel { public: explicit AbortOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("error_msg", &error_msg_)); + OP_REQUIRES_OK( + context, context->GetAttr("exit_without_error", &exit_without_error_)); } void Compute(OpKernelContext* context) override { - CHECK(false) << "Abort_op intentional failure; " << error_msg_; + if (!exit_without_error_) { + CHECK(false) << "Abort_op intentional failure; " << error_msg_; + } else { + LOG(WARNING) << "Exiting the process: " << error_msg_; + exit(0); + } } private: string error_msg_; + bool exit_without_error_; }; REGISTER_KERNEL_BUILDER(Name("Abort").Device(DEVICE_CPU), AbortOp); diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc index 383bc927be8bca..301609e04dc229 100644 --- a/tensorflow/core/kernels/control_flow_ops_test.cc +++ b/tensorflow/core/kernels/control_flow_ops_test.cc @@ -117,5 +117,14 @@ TEST_F(AbortOpTest, default_msg) { "Abort_op intentional failure; "); 
} +// Exit normally. +TEST_F(AbortOpTest, exit_normally) { + TF_ASSERT_OK(NodeDefBuilder("abort_op", "Abort") + .Attr("exit_without_error", true) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + EXPECT_EXIT(RunOpKernel(), ::testing::ExitedWithCode(0), ""); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 2d1b21d9e437ce..f6e3b532aa2404 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -32,6 +32,9 @@ limitations under the License. #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/deep_conv2d.h" #include "tensorflow/core/kernels/ops_util.h" +#ifdef TENSORFLOW_USE_LIBXSMM +#include "tensorflow/core/kernels/xsmm_conv2d.h" +#endif #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -157,6 +160,68 @@ class LaunchDeepConvOp { } }; +#ifdef TENSORFLOW_USE_LIBXSMM +template +class LaunchXsmmConvOp { + public: + static bool Run(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int batch, int input_rows, + int input_cols, int in_depth, int filter_rows, + int filter_cols, int pad_rows, int pad_cols, int out_rows, + int out_cols, int out_depth, int stride_rows, int stride_cols, + Tensor* output, TensorFormat data_format) { + return false; + } +}; + +template <> +class LaunchXsmmConvOp { + public: + static bool Run(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int batch, int input_rows, + int input_cols, int in_depth, int filter_rows, + int filter_cols, int pad_rows, int pad_cols, int out_rows, + int out_cols, int out_depth, int stride_rows, int stride_cols, + Tensor* output, TensorFormat data_format) { + // See libxsmm_dnn.h for this struct definition. + libxsmm_dnn_conv_desc desc; + desc.N = batch; + desc.C = in_depth; + desc.H = input_rows; + desc.W = input_cols; + desc.K = out_depth; + desc.R = filter_rows; + desc.S = filter_cols; + desc.u = stride_rows; + desc.v = stride_cols; + desc.pad_h_in = pad_rows; // ignored by libxsmm for now. + desc.pad_w_in = pad_cols; // ignored by libxsmm for now. + desc.pad_h_out = 0; + desc.pad_w_out = 0; + desc.threads = 0; // Unknown at this point, will be set later. 
+ desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; + desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC; + desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_RSCK; + desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; + desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; + desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; + desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; + + if (!CanUseXsmmConv2D(desc, data_format)) { + return false; + } + + auto input_ptr = input.template flat().data(); + auto filter_ptr = filter.template flat().data(); + auto output_ptr = output->template flat().data(); + + functor::XsmmConv2D()(ctx, desc, input_ptr, filter_ptr, + output_ptr); + return true; + } +}; +#endif + template class Conv2DOp : public BinaryOp { public: @@ -275,6 +340,15 @@ class Conv2DOp : public BinaryOp { return; } +#ifdef TENSORFLOW_USE_LIBXSMM + if (LaunchXsmmConvOp::Run( + context, input, filter, batch, input_rows, input_cols, in_depth, + filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols, + out_depth, stride_rows, stride_cols, output, data_format_)) { + return; + } +#endif + if (LaunchDeepConvOp::Run( context, input, filter, batch, input_rows, input_cols, in_depth, filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols, diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index 34103347fb975e..981b60c58c9499 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -440,6 +440,7 @@ struct use_bcast_optimization { // rsqrt(x) = x^(-1/2) // exp(x) = e^x // log(x) = natural logarithm of x +// log1p(x) = natural logarithm of 1 + x // tanh = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) // sigmoid = 1 / (1 + exp(-x)) // a.k.a, logistic // diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 96d8cd604aea12..94cc91bba3fe89 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -50,7 +50,6 @@ DEFINE_SETZERO_CPU(int32); DEFINE_SETZERO_CPU(int64); DEFINE_SETZERO_CPU(complex64); DEFINE_SETZERO_CPU(complex128); -DEFINE_SETZERO_CPU(string); #undef DEFINE_SETZERO_CPU #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD index 444180f9862980..1222093a7a327b 100644 --- a/tensorflow/core/kernels/hexagon/BUILD +++ b/tensorflow/core/kernels/hexagon/BUILD @@ -46,7 +46,10 @@ tf_cc_test( tf_cc_test( name = "graph_transferer_test", size = "small", - srcs = ["graph_transferer_test.cc"], + srcs = [ + "graph_transferer_test.cc", + "hexagon_graph_execution_test.cc", + ], deps = [ ":graph_transferer", "//tensorflow/cc:cc_ops", @@ -70,13 +73,16 @@ tf_kernel_library( name = "graph_transferer", srcs = [ "graph_transferer.cc", + "hexagon_control_wrapper.cc", "hexagon_ops_definitions.cc", "i_graph_transfer_ops_definitions.cc", ], hdrs = [ "graph_transferer.h", + "hexagon_control_wrapper.h", "hexagon_ops_definitions.h", "i_graph_transfer_ops_definitions.h", + "i_soc_control_wrapper.h", ], data = ["//tensorflow/core:example_parser_configuration_testdata"], deps = [ diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc index da13d640526307..352c2c63f0d631 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc @@ -256,8 +256,8 @@ Status GraphTransferer::LoadGraphFromProtoFile( for (int i = 0; i < input_node_info_list.size(); ++i) { const string& name = input_node_info_list.at(i).name; 
CHECK(output_tensor_map.count(name) == 0); - output_tensor_map.emplace( - name, &output_tensors.at(output_node_names.size() - 1 + i)); + output_tensor_map.emplace(name, + &output_tensors.at(output_node_names.size() + i)); } CHECK(graph_def.node_size() == output_tensors.size()); return status; @@ -482,7 +482,7 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides( std::vector kernel_sizes; context->GetAttr(KSIZE_ATTR_NAME, &kernel_sizes); const int ksize_id = RegisterConstantShape(kernel_sizes); - extra_inputs.push_back(ksize_id); + extra_inputs.insert(extra_inputs.begin(), ksize_id); } const std::string padding_str = padding == VALID ? PADDING_VALID_STR : PADDING_SAME_STR; @@ -605,13 +605,16 @@ void GraphTransferer::AppendNodeInputParams( NodeInputParams input_params; input_params.node_id = id; for (int i = 0; i < node.num_inputs(); ++i) { - const Node* input_node = nullptr; - TF_CHECK_OK(node.input_node(i, &input_node)); + const Edge* edge = nullptr; + TF_CHECK_OK(node.input_edge(i, &edge)); + const Node* input_node = edge->src(); + const int port = edge->src_output(); + const std::string& op_name = input_node->name(); CHECK(node_name_to_id_cache_map_.count(op_name) > 0) << op_name; const int src_id = node_name_to_id_cache_map_[op_name]; input_params.input_node_id_and_output_port_list.emplace_back( - std::make_tuple(src_id, i)); + std::make_tuple(src_id, port)); } for (const int extra_input : extra_inputs) { input_params.input_node_id_and_output_port_list.emplace_back( @@ -637,8 +640,7 @@ void GraphTransferer::AppendNodeOutputParams( CHECK(output_node != nullptr) << node.name() << ", " << node.type_string(); const int output_index = i; const DataType dt = node.output_type(output_index); - const size_t max_bytes_per_data = - checkpoint::TensorSliceWriter::MaxBytesPerElement(dt); + const size_t max_bytes_per_data = DataTypeSize(dt); shape_inference::InferenceContext* context = shape_refiner.GetContext(output_node); shape_inference::ShapeHandle shape_handle = context->output(output_index); @@ -836,7 +838,8 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const { sstream << "---(INPUT) [" << std::hex << params.node_id << std::dec; for (const std::tuple& pair : params.input_node_id_and_output_port_list) { - sstream << "," << std::get<0>(pair) << "," << std::get<1>(pair); + sstream << "," << std::hex << std::get<0>(pair) << std::dec << "," + << std::get<1>(pair); } sstream << "]"; LOG(INFO) << sstream.str(); diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc index 23d57ff3e98e29..b9a4c8aff0626a 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc @@ -423,8 +423,5 @@ TEST(GraphTransferer, LoadGraphFromProtoFile) { Status status = gt.LoadGraphFromProtoFile( *ops_definitions, filename, input_node_info_list, output_node_names, is_text_proto, true, &output_tensor_info); - // TODO(satok): Uncomment following assert once we fix the loader problem - // ASSERT_TRUE(status.ok()) << status; } - } // namespace tensorflow diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc new file mode 100644 index 00000000000000..ea5d0a73fa2b4b --- /dev/null +++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc @@ -0,0 +1,63 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +vcyou may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h" + +namespace tensorflow { + +int HexagonControlWrapper::GetVersion() const { + // TODO: Implement + return 1; +} + +bool HexagonControlWrapper::Init() { + // TODO: Implement + return false; +} + +bool HexagonControlWrapper::Finalize() { + // TODO: Implement + return false; +} +bool HexagonControlWrapper::SetupGraph( + const GraphTransferer &graph_transferer) { + // TODO: Implement + return false; +} + +bool HexagonControlWrapper::ExecuteGraph() { + // TODO: Implement + return false; +} + +bool HexagonControlWrapper::TeardownGraph() { + // TODO: Implement + return false; +} + +bool HexagonControlWrapper::FillInputNode(const string node_name, + const ByteArray bytes) { + // TODO: Implement + return false; +} + +bool HexagonControlWrapper::ReadOutputNode( + const string node_name, std::vector *const outputs) const { + CHECK(outputs != nullptr); + // TODO: Implement + return false; +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h new file mode 100644 index 00000000000000..743cb5393efd76 --- /dev/null +++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h @@ -0,0 +1,50 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +vcyou may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_ + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/hexagon/graph_transferer.h" +#include "tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +/* + HexagonControlWrapper is implementing interfaces in ISocControlWrapper. + This class calls APIs on hexagon via hexagon control binary. + TODO(satok): Add more documents about hexagon control binary. 
+ */ +class HexagonControlWrapper final : public ISocControlWrapper { + public: + HexagonControlWrapper() = default; + int GetVersion() const final; + bool Init() final; + bool Finalize() final; + bool SetupGraph(const GraphTransferer &graph_transferer) final; + bool ExecuteGraph() final; + bool TeardownGraph() final; + bool FillInputNode(string node_name, const ByteArray bytes) final; + bool ReadOutputNode(string node_name, + std::vector *outputs) const final; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(HexagonControlWrapper); +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_ diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc new file mode 100644 index 00000000000000..02733bf1b15cd6 --- /dev/null +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -0,0 +1,79 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/hexagon/graph_transferer.h" +#include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h" +#include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h" +#include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +#ifdef USE_HEXAGON_LIBS +TEST(GraphTransferer, RunInceptionV3OnHexagonExample) { + // Change file path to absolute path of model file on your local machine + const string filename = + "/tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb"; + const IGraphTransferOpsDefinitions* ops_definitions = + &HexagonOpsDefinitions::getInstance(); + std::vector input_node_info_list = { + GraphTransferer::InputNodeInfo{"Mul", + Tensor{DT_FLOAT, {1, 299, 299, 3}}}}; + std::vector output_node_names = {"softmax"}; + const bool is_text_proto = false; + + GraphTransferer::OutputTensorInfo output_tensor_info; + GraphTransferer gt; + gt.EnableStrictCheckMode(false); + Status status = gt.LoadGraphFromProtoFile( + *ops_definitions, filename, input_node_info_list, output_node_names, + is_text_proto, true /* dry_run_for_unknown_shape */, &output_tensor_info); + EXPECT_TRUE(status.ok()); + + HexagonControlWrapper hexagon_control_wrapper; + const int version = hexagon_control_wrapper.GetVersion(); + ASSERT_GE(version, 1); + LOG(INFO) << "Hexagon controller version is " << version; + + // 1. Initialize hexagon + hexagon_control_wrapper.Init(); + + // 2. Setup graph in hexagon + hexagon_control_wrapper.SetupGraph(gt); + + // 3. Fill input node's output + hexagon_control_wrapper.FillInputNode("Mul", {}); + + // 4. Execute graph + hexagon_control_wrapper.ExecuteGraph(); + + // 5. 
Read output node's outputs + std::vector outputs; + hexagon_control_wrapper.ReadOutputNode("softmax", &outputs); + + // 6. Teardown graph in hexagon + hexagon_control_wrapper.TeardownGraph(); + + // 7. Finalize hexagon + hexagon_control_wrapper.Finalize(); +} +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h b/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h new file mode 100644 index 00000000000000..1fb339891ca5a5 --- /dev/null +++ b/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h @@ -0,0 +1,67 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +vcyou may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/hexagon/graph_transferer.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class ISocControlWrapper { + public: + using ByteArray = + std::tuple; + + ISocControlWrapper() = default; + virtual ~ISocControlWrapper() = default; + + // Return version of SOC controller library. + // This function is mainly for a debug purpose to verify SOC controller. + virtual int GetVersion() const = 0; + + // Initialize SOC. This function should be called before + // starting graph transfer. + virtual bool Init() = 0; + + // Finalize SOC. This function should be called when all graph executions + // are finished. 
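The test above walks the Hexagon wrapper through a fixed seven step lifecycle. As a rough sketch, not part of this change, the same sequence can be written once against the ISocControlWrapper interface; the helper name is invented here, and the element type of the outputs vector is assumed to be ByteArray, matching FillInputNode.

#include <vector>

#include "tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

// Drives any ISocControlWrapper implementation through the same lifecycle as
// the Hexagon test above. Return values are ignored for brevity; the current
// HexagonControlWrapper stubs return false for everything but GetVersion().
void RunGraphOnSoc(ISocControlWrapper* soc, const GraphTransferer& gt,
                   const string& input_name,
                   const ISocControlWrapper::ByteArray& input_bytes,
                   const string& output_name,
                   std::vector<ISocControlWrapper::ByteArray>* outputs) {
  LOG(INFO) << "SOC controller version: " << soc->GetVersion();
  soc->Init();                                  // 1. Initialize the SOC.
  soc->SetupGraph(gt);                          // 2. Transfer the graph.
  soc->FillInputNode(input_name, input_bytes);  // 3. Feed the input node.
  soc->ExecuteGraph();                          // 4. Run the graph.
  soc->ReadOutputNode(output_name, outputs);    // 5. Read the outputs.
  soc->TeardownGraph();                         // 6. Tear down the graph.
  soc->Finalize();                              // 7. Release the SOC.
}

}  // namespace tensorflow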
+ virtual bool Finalize() = 0; + + // Setup graph on SOC + virtual bool SetupGraph(const GraphTransferer &graph_transferer) = 0; + + // Execute graph on SOC + virtual bool ExecuteGraph() = 0; + + // Teardown Graph on SOC + virtual bool TeardownGraph() = 0; + + // Fill input node's output on SOC + virtual bool FillInputNode(string node_name, const ByteArray bytes) = 0; + + // Read output node's outputs on SOC + virtual bool ReadOutputNode(string node_name, + std::vector *outputs) const = 0; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ISocControlWrapper); +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc index 6303a0f5cfbd45..016f5d794f7689 100644 --- a/tensorflow/core/kernels/lookup_table_init_op.cc +++ b/tensorflow/core/kernels/lookup_table_init_op.cc @@ -285,6 +285,15 @@ class TextFileLineIterator } tensor->flat()(0) = value; } break; + case DT_DOUBLE: { + double value; + if (!strings::safe_strtod(token.c_str(), &value)) { + valid_ = false; + return errors::InvalidArgument("Field ", token, " in line ", next_id_, + " is not a valid double."); + } + tensor->flat()(0) = value; + } break; case DT_STRING: tensor->flat()(0) = token; break; diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc index d34edbfe65a774..4aca9a726746a2 100644 --- a/tensorflow/core/kernels/lookup_table_op.cc +++ b/tensorflow/core/kernels/lookup_table_op.cc @@ -49,6 +49,10 @@ const float SubtleMustCopyUnlessStringOrFloat(const float value) { return value; } +const double SubtleMustCopyUnlessStringOrFloat(const double value) { + return value; +} + } // namespace // Lookup table that wraps an unordered_map, where the key and value data type @@ -850,6 +854,9 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImport").Device(DEVICE_CPU), LookupTableOp, key_dtype, \ value_dtype>) +REGISTER_KERNEL(string, double); +REGISTER_KERNEL(string, float); +REGISTER_KERNEL(string, int32); REGISTER_KERNEL(string, int64); REGISTER_KERNEL(int64, string); diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 4230c2aad4df1e..d7365c65601947 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -224,7 +224,7 @@ class SdcaOptimizer : public OpKernel { explicit SdcaOptimizer(OpKernelConstruction* const context) : OpKernel(context), options_(context) {} - void Compute(OpKernelContext* const context) override { + void Compute(OpKernelContext* context) override { DoCompute(options_, context); } @@ -244,7 +244,7 @@ class SdcaShrinkL1 : public OpKernel { OP_REQUIRES_OK(context, regularizations_.Initialize(context)); } - void Compute(OpKernelContext* const context) override { + void Compute(OpKernelContext* context) override { OpMutableInputList weights_inputs; OP_REQUIRES_OK(context, context->mutable_input_list("weights", &weights_inputs)); @@ -287,7 +287,7 @@ class SdcaFprint : public OpKernel { explicit SdcaFprint(OpKernelConstruction* const context) : OpKernel(context) {} - void Compute(OpKernelContext* const context) override { + void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("Input must be a vector, got shape ", diff --git a/tensorflow/contrib/metrics/kernels/set_kernels.cc b/tensorflow/core/kernels/set_kernels.cc similarity 
index 100% rename from tensorflow/contrib/metrics/kernels/set_kernels.cc rename to tensorflow/core/kernels/set_kernels.cc diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index 1fb1be6b628210..ae1700cd0a77ca 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" @@ -132,8 +133,9 @@ class TensorArray : public ResourceBase { // can hold more than MAX_INT entries, in practice we do not expect // users to construct this many Tensors for storage in a TensorArray. TensorArray(const DataType& dtype, const Tensor& handle, int32 N, - bool dynamic_size, bool multiple_writes_aggregate, bool is_grad, - int32 marked_size, bool clear_after_read) + const PartialTensorShape& element_shape, bool dynamic_size, + bool multiple_writes_aggregate, bool is_grad, int32 marked_size, + bool clear_after_read) : dtype_(dtype), handle_(handle), closed_(false), @@ -143,6 +145,7 @@ class TensorArray : public ResourceBase { clear_after_read_(clear_after_read), is_grad_(is_grad), marked_size_(marked_size), + element_shape_(element_shape), tensors_(N) {} // Write PersistentTensor 'value' to index 'index'. @@ -234,6 +237,22 @@ class TensorArray : public ResourceBase { DataType ElemType() const { return dtype_; } + PartialTensorShape ElemShape() { + mutex_lock l(mu_); + return element_shape_; + } + + Status SetElemShape(const PartialTensorShape& candidate) { + mutex_lock l(mu_); + PartialTensorShape new_element_shape_; + Status s = element_shape_.MergeWith(candidate, &new_element_shape_); + if (!s.ok()) { + return s; + } + element_shape_ = new_element_shape_; + return Status::OK(); + } + string DebugString() override { mutex_lock l(mu_); CHECK(!closed_); @@ -362,10 +381,14 @@ class TensorArray : public ResourceBase { // True iff this is a gradient tensor array. bool is_grad_; - // The size of the TensorArray after an unpack or split is performed. + // The size of the TensorArray after a (legacy) unpack or split is performed. // -1 if there has been no unpack or split performed on the TensorArray. int32 marked_size_; + // The shape of each element in the TensorArray, may be partially known or not + // known at all. + PartialTensorShape element_shape_ GUARDED_BY(mu_); + // TensorAndState is used to keep track of the PersistentTensors // stored in the TensorArray, along with their shapes, and a boolean // that determines whether they have already been read or not. 
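SetElemShape above narrows the stored element shape with PartialTensorShape::MergeWith, and the write path in the next hunk rejects incompatible values with IsCompatibleWith. A small standalone sketch of those two calls, using made-up shapes:

#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

void ElementShapeMergeDemo() {
  // A TensorArray starts with an unknown-rank element shape by default.
  PartialTensorShape element_shape;
  // A gather/pack announces a partially known shape: [?, 28, 28].
  PartialTensorShape merged;
  TF_CHECK_OK(
      element_shape.MergeWith(PartialTensorShape({-1, 28, 28}), &merged));
  element_shape = merged;  // Rank 3, first dimension still unknown.

  // Subsequent writes are checked against the merged shape.
  LOG(INFO) << element_shape.IsCompatibleWith(TensorShape({4, 28, 28}));  // 1
  LOG(INFO) << element_shape.IsCompatibleWith(TensorShape({4, 32, 32}));  // 0
}

}  // namespace tensorflow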
@@ -421,6 +444,14 @@ Status TensorArray::LockedWriteOrAggregate(OpKernelContext* ctx, " because the value dtype is ", DataTypeString(value_t->dtype()), " but TensorArray dtype is ", DataTypeString(dtype_), "."); } + if (!element_shape_.IsCompatibleWith(value_t->shape())) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", index, + " because the value shape is ", value_t->shape().DebugString(), + " which is incompatible with the TensorArray's element shape: ", + element_shape_.DebugString(), "."); + } if (t.read) { return errors::InvalidArgument("TensorArray ", handle_.vec()(1), diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index fa262324681e53..b90efd666a6c1e 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -136,6 +136,7 @@ class TensorArrayOp : public TensorArrayCreationOp { explicit TensorArrayOp(OpKernelConstruction* context) : TensorArrayCreationOp(context) { OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_)); OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_)); OP_REQUIRES_OK(context, context->GetAttr("clear_after_read", &clear_after_read_)); @@ -165,9 +166,9 @@ class TensorArrayOp : public TensorArrayCreationOp { handle(1) = unique_tensor_array_name; TensorArray* tensor_array = new TensorArray( - dtype_, *tensor_array_output_handle, size, dynamic_size_, - false /* multiple_writes_aggregate */, false /* is_grad */, - -1 /* marked_size */, clear_after_read_); + dtype_, *tensor_array_output_handle, size, element_shape_, + dynamic_size_, false /* multiple_writes_aggregate */, + false /* is_grad */, -1 /* marked_size */, clear_after_read_); TF_RETURN_IF_ERROR(rm->Create( ctx->step_container()->name(), @@ -180,6 +181,7 @@ class TensorArrayOp : public TensorArrayCreationOp { private: DataType dtype_; + PartialTensorShape element_shape_; bool dynamic_size_; bool clear_after_read_; string tensor_array_name_; // The name used to create the TensorArray. 
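At graph-construction time the new attr is populated like any other shape attr. The sketch below builds a TensorArrayV2 NodeDef with a partially known element shape; the node names are placeholders, and it assumes NodeDefBuilder's shape-attr overload accepts a PartialTensorShape, mirroring the GetAttr call above.

#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

// "ta_size" is assumed to be an existing int32 scalar node in the graph.
Status MakeTensorArrayV2(NodeDef* node_def) {
  return NodeDefBuilder("ta", "TensorArrayV2")
      .Input("ta_size", 0, DT_INT32)
      .Attr("dtype", DT_FLOAT)
      .Attr("element_shape", PartialTensorShape({-1, 28, 28}))
      .Attr("dynamic_size", false)
      .Attr("clear_after_read", true)
      .Finalize(node_def);
}

}  // namespace tensorflow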
@@ -188,7 +190,8 @@ class TensorArrayOp : public TensorArrayCreationOp { }; REGISTER_KERNEL_BUILDER(Name("TensorArray").Device(DEVICE_CPU), TensorArrayOp); -REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU), TensorArrayOp); +REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU), + TensorArrayOp); #if GOOGLE_CUDA @@ -198,8 +201,8 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU), TensorArrayOp) .TypeConstraint("dtype") \ .HostMemory("size") \ .HostMemory("handle"), \ - TensorArrayOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayV2") \ + TensorArrayOp); \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayV2") \ .Device(DEVICE_GPU) \ .TypeConstraint("dtype") \ .HostMemory("size") \ @@ -265,9 +268,9 @@ class TensorArrayGradOp : public TensorArrayCreationOp { tensor_array_output_handle](TensorArray** ret) -> Status { *ret = new TensorArray( tensor_array->ElemType(), *tensor_array_output_handle, array_size, - false /* dynamic_size */, true /* multiple_writes_aggregate */, - true /* is_grad */, marked_size /* marked_size */, - true /* close_after_read */); + tensor_array->ElemShape(), false /* dynamic_size */, + true /* multiple_writes_aggregate */, true /* is_grad */, + marked_size /* marked_size */, true /* close_after_read */); TF_RETURN_IF_ERROR((*ret)->CopyShapesFrom(tensor_array)); return Status::OK(); }; @@ -344,11 +347,11 @@ class TensorArrayWriteOp : public OpKernel { } }; -#define REGISTER_WRITE(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayWrite").Device(DEVICE_CPU).TypeConstraint("T"), \ - TensorArrayWriteOp);\ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_WRITE(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayWrite").Device(DEVICE_CPU).TypeConstraint("T"), \ + TensorArrayWriteOp); \ + REGISTER_KERNEL_BUILDER( \ Name("TensorArrayWriteV2").Device(DEVICE_CPU).TypeConstraint("T"), \ TensorArrayWriteOp); @@ -358,18 +361,18 @@ TF_CALL_ALL_TYPES(REGISTER_WRITE); #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayWrite") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("handle") \ - .HostMemory("index"), \ +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayWrite") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("handle") \ + .HostMemory("index"), \ TensorArrayWriteOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("handle") \ - .HostMemory("index"), \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("handle") \ + .HostMemory("index"), \ TensorArrayWriteOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -419,14 +422,14 @@ class TensorArrayReadOp : public OpKernel { DataType dtype_; }; -#define REGISTER_READ(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ +#define REGISTER_READ(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ TensorArrayReadOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ TensorArrayReadOp); TF_CALL_ALL_TYPES(REGISTER_READ) @@ -435,18 +438,18 @@ TF_CALL_ALL_TYPES(REGISTER_READ) #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \ - 
.Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("handle") \ - .HostMemory("index"), \ +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("handle") \ + .HostMemory("index"), \ TensorArrayReadOp); \ REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("handle") \ - .HostMemory("index"), \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("handle") \ + .HostMemory("index"), \ TensorArrayReadOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -484,6 +487,10 @@ class TensorArrayPackOrGatherOp : public OpKernel { "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()), " but Op requested dtype ", DataTypeString(dtype_), ".")); + // Ensure new element shape is compatible with the one stored in the + // TensorArray. + OP_REQUIRES_OK(ctx, tensor_array->SetElemShape(element_shape_)); + int32 num_indices; std::vector values; std::vector indices; @@ -577,21 +584,21 @@ class TensorArrayPackOrGatherOp : public OpKernel { PartialTensorShape element_shape_; }; -#define REGISTER_GATHER_AND_PACK(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayPack") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ - TensorArrayPackOrGatherOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayGather") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ +#define REGISTER_GATHER_AND_PACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayPack") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ + TensorArrayPackOrGatherOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayGather") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ TensorArrayPackOrGatherOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayGatherV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayGatherV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ TensorArrayPackOrGatherOp); TF_CALL_ALL_TYPES(REGISTER_GATHER_AND_PACK); @@ -604,26 +611,26 @@ REGISTER_GATHER_AND_PACK(bfloat16); #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayPack") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("handle"), \ - TensorArrayPackOrGatherOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayGather") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("indices") \ - .HostMemory("handle"), \ +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayPack") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("handle"), \ + TensorArrayPackOrGatherOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayGather") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("indices") \ + .HostMemory("handle"), \ TensorArrayPackOrGatherOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayGatherV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("indices") \ - .HostMemory("handle"), \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayGatherV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("indices") \ + .HostMemory("handle"), \ TensorArrayPackOrGatherOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -789,18 +796,18 @@ class TensorArrayConcatOp : public OpKernel { PartialTensorShape element_shape_except0_; }; -#define REGISTER_CONCAT(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \ - .Device(DEVICE_CPU) \ 
- .TypeConstraint("dtype") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ +#define REGISTER_CONCAT(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArrayConcatOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArrayConcatOp) TF_CALL_ALL_TYPES(REGISTER_CONCAT); @@ -813,18 +820,18 @@ REGISTER_CONCAT(bfloat16); #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArrayConcatOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ + REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArrayConcatOp) TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -986,9 +993,11 @@ class TensorArrayUnpackOrScatterOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("TensorArrayScatter").Device(DEVICE_CPU).TypeConstraint("T"), \ TensorArrayUnpackOrScatterOp); \ + false /* LEGACY_UNPACK */>); \ REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayScatterV2").Device(DEVICE_CPU).TypeConstraint("T"), \ + Name("TensorArrayScatterV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ TensorArrayUnpackOrScatterOp); @@ -997,29 +1006,29 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK); #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayUnpack") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("handle"), \ - TensorArrayUnpackOrScatterOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArrayScatter") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("indices") \ - .HostMemory("handle"), \ - TensorArrayUnpackOrScatterOp("T") \ + .HostMemory("handle"), \ + TensorArrayUnpackOrScatterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArrayScatter") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("handle"), \ + TensorArrayUnpackOrScatterOp); \ - REGISTER_KERNEL_BUILDER( \ + REGISTER_KERNEL_BUILDER( \ Name("TensorArrayScatterV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("indices") \ - .HostMemory("handle"), \ - TensorArrayUnpackOrScatterOp("T") \ + .HostMemory("indices") \ + .HostMemory("handle"), \ + TensorArrayUnpackOrScatterOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -1151,11 +1160,11 @@ class TensorArraySplitOp : public OpKernel { } }; -#define REGISTER_SPLIT(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("TensorArraySplit").Device(DEVICE_CPU).TypeConstraint("T"), \ - TensorArraySplitOp); \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SPLIT(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("TensorArraySplit").Device(DEVICE_CPU).TypeConstraint("T"), \ + TensorArraySplitOp); \ + REGISTER_KERNEL_BUILDER( \ 
Name("TensorArraySplitV2").Device(DEVICE_CPU).TypeConstraint("T"), \ TensorArraySplitOp); @@ -1164,18 +1173,18 @@ TF_CALL_ALL_TYPES(REGISTER_SPLIT); #if GOOGLE_CUDA -#define REGISTER_GPU(type) \ - REGISTER_KERNEL_BUILDER(Name("TensorArraySplit") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("TensorArraySplit") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArraySplitOp); \ - REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("lengths") \ - .HostMemory("handle"), \ + REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lengths") \ + .HostMemory("handle"), \ TensorArraySplitOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 8819124ae67bdc..34e227156d84a6 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -24,6 +24,7 @@ limitations under the License. namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp); +REGISTER_KERNEL_BUILDER(Name("VariableV2").Device(DEVICE_CPU), VariableOp); REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU), TemporaryVariableOp); REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable").Device(DEVICE_CPU), @@ -38,6 +39,10 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU), .Device(DEVICE_SYCL) \ .TypeConstraint("dtype"), \ VariableOp); \ + REGISTER_KERNEL_BUILDER(Name("VariableV2") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("dtype"), \ + VariableOp); \ REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ .Device(DEVICE_SYCL) \ .TypeConstraint("dtype"), \ @@ -59,22 +64,25 @@ REGISTER_SYCL_KERNEL(float); #if GOOGLE_CUDA // Only register 'Variable' on GPU for the subset of types also supported by // 'Assign' (see dense_update_ops.cc.) 
-#define REGISTER_GPU_KERNELS(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Variable").Device(DEVICE_GPU).TypeConstraint("dtype"), \ - VariableOp); \ - REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype"), \ - TemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - DestroyTemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype") \ - .HostMemory("is_initialized"), \ +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Variable").Device(DEVICE_GPU).TypeConstraint("dtype"), \ + VariableOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("VariableV2").Device(DEVICE_GPU).TypeConstraint("dtype"), \ + VariableOp); \ + REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype"), \ + TemporaryVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DestroyTemporaryVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("is_initialized"), \ IsVariableInitializedOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc new file mode 100644 index 00000000000000..75cd7325575ef7 --- /dev/null +++ b/tensorflow/core/kernels/xsmm_conv2d.cc @@ -0,0 +1,143 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Make this file empty (or nearly empty) so that it can be compiled even when +// libxsmm is not available. + +#ifndef TENSORFLOW_USE_LIBXSMM +void dummy_xsmm_conv2d_ensure_file_is_not_empty(void); +#else + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/xsmm_conv2d.h" + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/blocking_counter.h" +#include "tensorflow/core/lib/core/threadpool.h" + +#include "libxsmm/include/libxsmm_cpuid.h" + +namespace tensorflow { + +// XsmmConv2D is a wrapper for libxsmm direct convolutions. + +// Returns true if convolution can be computed efficiently by XsmmConv2D, +// returns false otherwise. 
+bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
+                      TensorFormat data_format) {
+  int VECTOR_SIZE;
+  int arch = libxsmm_cpuid_x86();
+
+  if (arch == LIBXSMM_X86_AVX512_CORE) {
+    VECTOR_SIZE = 16;
+  } else if (arch == LIBXSMM_X86_AVX2) {
+    VECTOR_SIZE = 8;
+  } else {
+    VLOG(1) << "Cannot use XSMM convolutions: unsupported architecture!";
+    return false;
+  }
+
+  if (data_format != FORMAT_NHWC) {
+    VLOG(1) << "Cannot use XSMM convolutions: unsupported format!";
+    return false;
+  }
+  if (desc.pad_h_in != 0 || desc.pad_w_in != 0) {
+    VLOG(1) << "Cannot use XSMM convolutions: unsupported padding!";
+    return false;
+  }
+  if (desc.K % VECTOR_SIZE != 0) {
+    VLOG(1) << "Cannot use XSMM convolutions: output features count not"
+               " divisible by vector size!";
+    return false;
+  }
+  VLOG(2) << "Can use XSMM convolutions.";
+  return true;
+}
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct XsmmConv2D<CPUDevice, T> {
+  static void chkerr(libxsmm_dnn_err_t status, string msg) {
+    if (status != LIBXSMM_DNN_SUCCESS) {
+      VLOG(0) << msg << " failed: " << libxsmm_dnn_get_error(status);
+    }
+  }
+
+  void operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
+                  const T* input, const T* filter, T* output) {
+    libxsmm_dnn_err_t status;
+
+    libxsmm_dnn_conv_handle* libxsmm_handle;
+    libxsmm_handle = libxsmm_dnn_create_conv_handle_check(desc, &status);
+    chkerr(status, "Create handle");
+
+    libxsmm_dnn_buffer* libxsmm_input;
+    libxsmm_dnn_buffer* libxsmm_output;
+    libxsmm_dnn_filter* libxsmm_filter;
+
+    libxsmm_input = libxsmm_dnn_link_input_buffer_check(
+        libxsmm_handle, input, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+    chkerr(status, "Link input buffer");
+    libxsmm_output = libxsmm_dnn_link_output_buffer_check(
+        libxsmm_handle, output, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+    chkerr(status, "Link output buffer");
+    libxsmm_filter = libxsmm_dnn_link_filter_check(
+        libxsmm_handle, filter, LIBXSMM_DNN_CONV_FORMAT_RSCK_PTR, &status);
+    chkerr(status, "Link filter");
+
+    chkerr(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
+
+    chkerr(libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_input),
+           "Bind input");
+    chkerr(libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_output),
+           "Bind output");
+    chkerr(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter),
+           "Bind filter");
+
+    // TODO(maciejd) We would prefer raw threads instead of threadpool.
+    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+    int num_threads = worker_threads.num_threads;
+    BlockingCounter counter(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      worker_threads.workers->Schedule([=, &counter]() {
+        chkerr(libxsmm_dnn_convolve_st(libxsmm_handle,
+                                       LIBXSMM_DNN_CONV_KIND_FWD, 0, i),
+               "Worker");
+        counter.DecrementCount();
+      });
+    }
+    counter.Wait();
+
+    chkerr(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
+    chkerr(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
+    chkerr(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
+    chkerr(libxsmm_dnn_destroy_conv_handle(libxsmm_handle), "Destroy handle");
+  }
+};
+
+}  // namespace functor
+
+template struct functor::XsmmConv2D<CPUDevice, float>;
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_LIBXSMM
diff --git a/tensorflow/core/kernels/xsmm_conv2d.h b/tensorflow/core/kernels/xsmm_conv2d.h
new file mode 100644
index 00000000000000..ad4777040a5342
--- /dev/null
+++ b/tensorflow/core/kernels/xsmm_conv2d.h
@@ -0,0 +1,48 @@
+/* Copyright 2016 The TensorFlow Authors.
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_ + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "libxsmm/include/libxsmm.h" +#include "libxsmm/include/libxsmm_dnn.h" + +namespace tensorflow { + +class OpKernelContext; + +// XsmmConv2D is a wrapper for libxsmm direct convolutions. + +// Returns true if convolution operation specified by function arguments +// can use XsmmConv2D implementation, and false otherwise. +bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc, + TensorFormat data_format); + +namespace functor { + +template +struct XsmmConv2D { + void operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc, + const T* input, const T* filter, T* output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_ diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc new file mode 100644 index 00000000000000..d81368314cd62b --- /dev/null +++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc @@ -0,0 +1,27 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
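The xsmm_conv2d_test.cc added below is still a stub. One direction it could take, sketched here under the assumption that the test is built with TENSORFLOW_USE_LIBXSMM, is to pin down the gating rules of CanUseXsmmConv2D; the descriptor values and test name are made up, and only rejection cases are asserted so the result does not depend on the host CPU supporting AVX2 or AVX-512.

#include "tensorflow/core/kernels/xsmm_conv2d.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
namespace {

TEST(XsmmConv2DTest, GatingRejectsUnsupportedCases) {
  libxsmm_dnn_conv_desc desc;
  desc.N = 1;
  desc.C = 3;
  desc.H = 224;
  desc.W = 224;
  desc.K = 64;  // Divisible by both the AVX2 (8) and AVX-512 (16) vector sizes.
  desc.R = 3;
  desc.S = 3;
  desc.u = 1;
  desc.v = 1;
  desc.pad_h_in = 0;
  desc.pad_w_in = 0;

  // NCHW input is rejected regardless of the descriptor.
  EXPECT_FALSE(CanUseXsmmConv2D(desc, FORMAT_NCHW));

  // Non-zero input padding is rejected as well.
  desc.pad_h_in = 1;
  EXPECT_FALSE(CanUseXsmmConv2D(desc, FORMAT_NHWC));
}

}  // namespace
}  // namespace tensorflow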
+==============================================================================*/ + +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +void RunXsmmVsGeneric() {} + +TEST(XsmmConv2DTest, Basic) {} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc index a2245bb28ec066..59d77a37344532 100644 --- a/tensorflow/core/lib/core/threadpool.cc +++ b/tensorflow/core/lib/core/threadpool.cc @@ -123,6 +123,15 @@ void ThreadPool::ParallelFor(int64 total, int64 cost_per_unit, impl_->ParallelFor(total, cost_per_unit, std::move(fn)); } +void ThreadPool::ParallelForWithWorkerId( + int64 total, int64 cost_per_unit, + const std::function& fn) { + impl_->ParallelFor(total, cost_per_unit, + [this, &fn](int64 start, int64 limit) { + fn(start, limit, CurrentThreadId()); + }); +} + int ThreadPool::NumThreads() const { return impl_->NumThreads(); } int ThreadPool::CurrentThreadId() const { return impl_->CurrentThreadId(); } diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h index 371339598d34c4..4d791721b43e4f 100644 --- a/tensorflow/core/lib/core/threadpool.h +++ b/tensorflow/core/lib/core/threadpool.h @@ -47,7 +47,7 @@ class ThreadPool { // Schedule fn() for execution in the pool of threads. void Schedule(std::function fn); - // ParallelFor shards the "total" unit of work assuming each unit of work + // ParallelFor shards the "total" units of work assuming each unit of work // having roughly "cost_per_unit" cost, in cycles. Each unit of work is // indexed 0, 1, ..., total - 1. Each shard contains 1 or more units of work // and the total cost of each shard is roughly the same. @@ -60,6 +60,15 @@ class ThreadPool { void ParallelFor(int64 total, int64 cost_per_unit, std::function fn); + // Shard the "total" units of work. For more details, see "ParallelFor". + // + // The function is passed a thread_id in the range [0, NumThreads()]. The + // functions can safely write to a partial result for their id, in a tensor of + // size (NumThreads(), ...). + void ParallelForWithWorkerId( + int64 total, int64 cost_per_unit, + const std::function& fn); + // Returns the number of threads in the pool. int NumThreads() const; diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc index c7d8db51364fe5..cecdc3437996ae 100644 --- a/tensorflow/core/lib/core/threadpool_test.cc +++ b/tensorflow/core/lib/core/threadpool_test.cc @@ -80,6 +80,42 @@ TEST(ThreadPool, ParallelFor) { } } +TEST(ThreadPool, ParallelForWithWorkerId) { + // Make ParallelForWithWorkerId use as many threads as possible. + int64 kHugeCost = 1 << 30; + for (int num_threads = 1; num_threads < kNumThreads; num_threads++) { + fprintf(stderr, "Testing with %d threads\n", num_threads); + const int kWorkItems = 15; + bool work[kWorkItems]; + ThreadPool pool(Env::Default(), "test", num_threads); + for (int i = 0; i < kWorkItems; i++) { + work[i] = false; + } + std::atomic threads_running[kNumThreads]; + for (int i = 0; i < num_threads; i++) { + threads_running[i] = false; + } + pool.ParallelForWithWorkerId( + kWorkItems, kHugeCost, + [&threads_running, &work](int64 begin, int64 end, int64 id) { + // Store true for the current thread, and assert that another thread + // is not running with the same id. 
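The ParallelForWithWorkerId comment added to threadpool.h above suggests keeping one partial result per worker id and combining them afterwards. A hedged usage sketch, with an invented function name and arbitrary pool size and cost estimate:

#include <vector>

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

double ParallelSum(const std::vector<double>& values) {
  thread::ThreadPool pool(Env::Default(), "sum_pool", 4 /* num_threads */);
  // One slot per possible worker id, per the [0, NumThreads()] range quoted
  // in the threadpool.h comment above.
  std::vector<double> partial(pool.NumThreads() + 1, 0.0);
  pool.ParallelForWithWorkerId(
      values.size(), 1 << 20 /* cost_per_unit, arbitrary */,
      [&values, &partial](int64 start, int64 limit, int64 worker_id) {
        for (int64 i = start; i < limit; ++i) {
          partial[worker_id] += values[i];  // No locking needed per id.
        }
      });
  double total = 0.0;
  for (double p : partial) total += p;
  return total;
}

}  // namespace tensorflow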
+ ASSERT_FALSE(threads_running[id].exchange(true)); + for (int64 i = begin; i < end; ++i) { + ASSERT_FALSE(work[i]); + work[i] = true; + } + ASSERT_TRUE(threads_running[id].exchange(false)); + }); + for (int i = 0; i < kWorkItems; i++) { + ASSERT_TRUE(work[i]); + } + for (int i = 0; i < num_threads; i++) { + ASSERT_FALSE(threads_running[i]); + } + } +} + static void BM_Sequential(int iters) { ThreadPool pool(Env::Default(), "test", kNumThreads); // Decrement count sequentially until 0. diff --git a/tensorflow/core/lib/jpeg/jpeg_handle.cc b/tensorflow/core/lib/jpeg/jpeg_handle.cc index 64e7885ca38bfd..ce6398709263ce 100644 --- a/tensorflow/core/lib/jpeg/jpeg_handle.cc +++ b/tensorflow/core/lib/jpeg/jpeg_handle.cc @@ -147,8 +147,16 @@ void MemTermSource(j_decompress_ptr cinfo) {} // ----------------------------------------------------------------------------- void MemSkipInputData(j_decompress_ptr cinfo, long jump) { MemSourceMgr *src = reinterpret_cast(cinfo->src); - src->pub.bytes_in_buffer -= jump; - src->pub.next_input_byte += jump; + if (jump < 0) { + return; + } + if (jump > src->pub.bytes_in_buffer) { + src->pub.bytes_in_buffer = 0; + (void)MemFillInputBuffer(cinfo); // warn with a fake EOI or error + } else { + src->pub.bytes_in_buffer -= jump; + src->pub.next_input_byte += jump; + } } // ----------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index 84f58f85513bfa..bd165e39d8b51a 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -8,6 +8,23 @@ op { } } } +op { + name: "Abort" + attr { + name: "error_msg" + type: "string" + default_value { + s: "" + } + } + attr { + name: "exit_without_error" + type: "bool" + default_value { + b: false + } + } +} op { name: "Abs" input_arg { @@ -9766,6 +9783,112 @@ op { type: DT_STRING } } +op { + name: "DenseToDenseSetOperation" + input_arg { + name: "set1" + type_attr: "T" + } + input_arg { + name: "set2" + type_attr: "T" + } + output_arg { + name: "result_indices" + type: DT_INT64 + } + output_arg { + name: "result_values" + type_attr: "T" + } + output_arg { + name: "result_shape" + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } +} +op { + name: "DenseToSparseSetOperation" + input_arg { + name: "set1" + type_attr: "T" + } + input_arg { + name: "set2_indices" + type: DT_INT64 + } + input_arg { + name: "set2_values" + type_attr: "T" + } + input_arg { + name: "set2_shape" + type: DT_INT64 + } + output_arg { + name: "result_indices" + type: DT_INT64 + } + output_arg { + name: "result_values" + type_attr: "T" + } + output_arg { + name: "result_shape" + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } +} op { name: "DepthToSpace" input_arg { @@ -26386,6 +26509,47 @@ op { type: "type" } } +op { + name: "SetSize" + input_arg { + name: "set_indices" + type: 
DT_INT64 + } + input_arg { + name: "set_values" + type_attr: "T" + } + input_arg { + name: "set_shape" + type: DT_INT64 + } + output_arg { + name: "size" + type: DT_INT32 + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } +} op { name: "Shape" input_arg { @@ -30732,6 +30896,71 @@ op { } } } +op { + name: "SparseToSparseSetOperation" + input_arg { + name: "set1_indices" + type: DT_INT64 + } + input_arg { + name: "set1_values" + type_attr: "T" + } + input_arg { + name: "set1_shape" + type: DT_INT64 + } + input_arg { + name: "set2_indices" + type: DT_INT64 + } + input_arg { + name: "set2_values" + type_attr: "T" + } + input_arg { + name: "set2_shape" + type: DT_INT64 + } + output_arg { + name: "result_indices" + type: DT_INT64 + } + output_arg { + name: "result_values" + type_attr: "T" + } + output_arg { + name: "result_shape" + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } +} op { name: "Split" input_arg { @@ -32512,6 +32741,56 @@ op { } is_stateful: true } +op { + name: "TensorArray" + input_arg { + name: "size" + type: DT_INT32 + } + output_arg { + name: "handle" + type: DT_STRING + is_ref: true + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "dynamic_size" + type: "bool" + default_value { + b: false + } + } + attr { + name: "clear_after_read" + type: "bool" + default_value { + b: true + } + } + attr { + name: "tensor_array_name" + type: "string" + default_value { + s: "" + } + } + attr { + name: "element_shape" + type: "shape" + default_value { + shape { + unknown_rank: true + } + } + } + deprecation { + version: 16 + } + is_stateful: true +} op { name: "TensorArrayClose" input_arg { @@ -33316,6 +33595,52 @@ op { } is_stateful: true } +op { + name: "TensorArrayV2" + input_arg { + name: "size" + type: DT_INT32 + } + output_arg { + name: "handle" + type: DT_STRING + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "element_shape" + type: "shape" + default_value { + shape { + unknown_rank: true + } + } + } + attr { + name: "dynamic_size" + type: "bool" + default_value { + b: false + } + } + attr { + name: "clear_after_read" + type: "bool" + default_value { + b: true + } + } + attr { + name: "tensor_array_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "TensorArrayWrite" input_arg { @@ -34667,6 +34992,37 @@ op { } is_stateful: true } +op { + name: "VariableV2" + output_arg { + name: "ref" + type_attr: "dtype" + is_ref: true + } + attr { + name: "shape" + type: "shape" + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "Where" input_arg { diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 645ae435531057..d7aacf75fe41be 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -314,9 +314,10 @@ Only 
useful as a placeholder for control edges. // -------------------------------------------------------------------------- REGISTER_OP("Abort") .Attr("error_msg: string = ''") + .Attr("exit_without_error: bool = false") .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( -Raise a exception to abort the process when called. +Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal. Returns nothing but an exception. diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index 646cd2e5530290..bb1856058c5730 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -774,6 +774,7 @@ handle: The handle to a stack. REGISTER_OP("TensorArrayV2") .Input("size: int32") .Attr("dtype: type") + .Attr("element_shape: shape = { unknown_rank: true }") .Attr("dynamic_size: bool = false") .Attr("clear_after_read: bool = true") .Attr("tensor_array_name: string = ''") @@ -792,6 +793,9 @@ via Read or Pack. handle: The handle to the TensorArray. size: The size of the array. dtype: The type of the elements on the tensor_array. +element_shape: The expected shape of an element, if known. Used to + validate the shapes of TensorArray elements. If this shape is not + fully specified, gathering zero-size TensorArrays is an error. dynamic_size: A boolean that determines whether writes to the TensorArray are allowed to grow the size. By default, this is not allowed. clear_after_read: If true (default), Tensors in the TensorArray are cleared @@ -1113,6 +1117,7 @@ REGISTER_OP("TensorArray") .Attr("dynamic_size: bool = false") .Attr("clear_after_read: bool = true") .Attr("tensor_array_name: string = ''") + .Attr("element_shape: shape = { unknown_rank: true }") .Output("handle: Ref(string)") .SetIsStateful() .SetShapeFn([](InferenceContext* c) { return Status::OK(); }) diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc index 8b688d3e4c6c3e..58a435e3145395 100644 --- a/tensorflow/core/ops/math_grad.cc +++ b/tensorflow/core/ops/math_grad.cc @@ -133,6 +133,18 @@ Status LogGrad(const AttrSlice& attrs, FunctionDef* g) { } REGISTER_OP_GRADIENT("Log", LogGrad); +Status Log1pGrad(const AttrSlice& attrs, FunctionDef* g) { + // clang-format off + return GradForUnaryCwise(g, { + FDH::Const("const", 1.0f), + {{"one"}, "Cast", {"const"}, {{"SrcT", DT_FLOAT}, {"DstT", "$T"}}}, + {{"a"}, "Add", {"one", "x"}}, + {{"dx"}, "Div", {"dy", "a"}}, // dy / (1 + x) + }); + // clang-format on +} +REGISTER_OP_GRADIENT("Log1p", Log1pGrad); + Status TanhGrad(const AttrSlice& attrs, FunctionDef* g) { // clang-format off return GradForUnaryCwise(g, { diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc index b801a8c5a11e61..1fc5a1b12877e5 100644 --- a/tensorflow/core/ops/math_grad_test.cc +++ b/tensorflow/core/ops/math_grad_test.cc @@ -480,6 +480,16 @@ TEST_F(MathGradTest, Log) { test::ExpectClose(ans, dx); } +TEST_F(MathGradTest, Log1p) { + auto x = test::AsTensor({0.1f, 1.f, 2.f, 3.f, 4.f, 10.f}, + TensorShape({2, 3})); + auto g = [](float x) { return 1 / (1 + x); }; + auto dx = test::AsTensor( + {g(.1f), g(1.f), g(2.f), g(3.f), g(4.f), g(10.f)}, TensorShape({2, 3})); + auto ans = SymGrad("Log1p", x); + test::ExpectClose(ans, dx); +} + TEST_F(MathGradTest, Tanh) { auto x = test::AsTensor({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f}, TensorShape({2, 3})); diff --git a/tensorflow/core/ops/ops.pbtxt 
b/tensorflow/core/ops/ops.pbtxt index ff46aa2725978f..4b8d58de085987 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -8,7 +8,14 @@ op { } description: "A string which is the message associated with the exception." } - summary: "Raise a exception to abort the process when called." + attr { + name: "exit_without_error" + type: "bool" + default_value { + b: false + } + } + summary: "Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal." description: "Returns nothing but an exception." } op { @@ -5359,6 +5366,128 @@ op { } summary: "Delete the tensor specified by its handle in the session." } +op { + name: "DenseToDenseSetOperation" + input_arg { + name: "set1" + description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored." + type_attr: "T" + } + input_arg { + name: "set2" + description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.\nDimension `n` contains values in a set, duplicates are allowed but ignored." + type_attr: "T" + } + output_arg { + name: "result_indices" + description: "2D indices of a `SparseTensor`." + type: DT_INT64 + } + output_arg { + name: "result_values" + description: "1D values of a `SparseTensor`." + type_attr: "T" + } + output_arg { + name: "result_shape" + description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions." + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } + summary: "Applies set operation along last dimension of 2 `Tensor` inputs." + description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`." +} +op { + name: "DenseToSparseSetOperation" + input_arg { + name: "set1" + description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored." + type_attr: "T" + } + input_arg { + name: "set2_indices" + description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder." + type: DT_INT64 + } + input_arg { + name: "set2_values" + description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder." + type_attr: "T" + } + input_arg { + name: "set2_shape" + description: "1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must\nbe the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the\nmax set size across `n-1` dimensions." + type: DT_INT64 + } + output_arg { + name: "result_indices" + description: "2D indices of a `SparseTensor`." 
+ type: DT_INT64 + } + output_arg { + name: "result_values" + description: "1D values of a `SparseTensor`." + type_attr: "T" + } + output_arg { + name: "result_shape" + description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions." + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } + summary: "Applies set operation along last dimension of `Tensor` and `SparseTensor`." + description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nInput `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,\nand `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same\nas `set1`. Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set2`\nindices.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`." +} op { name: "DepthToSpace" input_arg { @@ -16458,6 +16587,53 @@ op { } summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object." } +op { + name: "SetSize" + input_arg { + name: "set_indices" + description: "2D `Tensor`, indices of a `SparseTensor`." + type: DT_INT64 + } + input_arg { + name: "set_values" + description: "1D `Tensor`, values of a `SparseTensor`." + type_attr: "T" + } + input_arg { + name: "set_shape" + description: "1D `Tensor`, shape of a `SparseTensor`." + type: DT_INT64 + } + output_arg { + name: "size" + description: "For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st\n`n-1` dimensions as `set`. Each value is the number of unique elements in\nthe corresponding `[0...n-1]` dimension of `set`." + type: DT_INT32 + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } + summary: "Number of unique elements along last dimension of input `set`." + description: "Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,\nand `set_shape`. The last dimension contains values in a set, duplicates are\nallowed but ignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set`\nindices." +} op { name: "Shape" input_arg { @@ -19423,6 +19599,82 @@ op { summary: "Converts a sparse representation into a dense tensor." description: "Builds an array `dense` with shape `output_shape` such that\n\n```prettyprint\n# If sparse_indices is scalar\ndense[i] = (i == sparse_indices ? 
sparse_values : default_value)\n\n# If sparse_indices is a vector, then for each i\ndense[sparse_indices[i]] = sparse_values[i]\n\n# If sparse_indices is an n by d matrix, then for each i in [0, n)\ndense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]\n```\n\nAll other values in `dense` are set to `default_value`. If `sparse_values` is a\nscalar, all sparse indices are set to this single value.\n\nIndices should be sorted in lexicographic order, and indices must not\ncontain any repeats. If `validate_indices` is true, these properties\nare checked during execution." } +op { + name: "SparseToSparseSetOperation" + input_arg { + name: "set1_indices" + description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder." + type: DT_INT64 + } + input_arg { + name: "set1_values" + description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder." + type_attr: "T" + } + input_arg { + name: "set1_shape" + description: "1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must\nbe the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the\nmax set size across `0...n-1` dimensions." + type: DT_INT64 + } + input_arg { + name: "set2_indices" + description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder." + type: DT_INT64 + } + input_arg { + name: "set2_values" + description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder." + type_attr: "T" + } + input_arg { + name: "set2_shape" + description: "1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must\nbe the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the\nmax set size across `0...n-1` dimensions." + type: DT_INT64 + } + output_arg { + name: "result_indices" + description: "2D indices of a `SparseTensor`." + type: DT_INT64 + } + output_arg { + name: "result_values" + description: "1D values of a `SparseTensor`." + type_attr: "T" + } + output_arg { + name: "result_shape" + description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions." + type: DT_INT64 + } + attr { + name: "set_operation" + type: "string" + } + attr { + name: "validate_indices" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_STRING + } + } + } + summary: "Applies set operation along last dimension of 2 `SparseTensor` inputs." + description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nIf `validate_indices` is `True`, `SparseToSparseSetOperation` validates the\norder and range of `set1` and `set2` indices.\n\nInput `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,\nand `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same\nas `set2`. Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nInput `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,\nand `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same\nas `set1`. 
Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set1`\nand `set2` indices.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`." +} op { name: "Split" input_arg { @@ -20622,6 +20874,15 @@ op { s: "" } } + attr { + name: "element_shape" + type: "shape" + default_value { + shape { + unknown_rank: true + } + } + } deprecation { version: 16 explanation: "Use TensorArrayV2" @@ -21159,6 +21420,16 @@ op { type: "type" description: "The type of the elements on the tensor_array." } + attr { + name: "element_shape" + type: "shape" + default_value { + shape { + unknown_rank: true + } + } + description: "The expected shape of an element, if known. Used to\nvalidate the shapes of TensorArray elements. If this shape is not\nfully specified, gathering zero-size TensorArrays is an error." + } attr { name: "dynamic_size" type: "bool" @@ -21957,6 +22228,44 @@ op { type_attr: "dtype" is_ref: true } + attr { + name: "shape" + type: "shape" + description: "The shape of the variable tensor, where scalar shapes are\ntreated as undefined." + } + attr { + name: "dtype" + type: "type" + description: "The type of elements in the variable tensor." + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this variable is placed in the given container.\nOtherwise, a default container is used." + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this variable is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead." + } + summary: "Holds state in the form of a tensor that persists across steps." + description: "Outputs a ref to the tensor state so it may be read or modified.\nTODO(zhifengc/mrry): Adds a pointer to a more detail document\nabout sharing states in tensorflow." + is_stateful: true +} +op { + name: "VariableV2" + output_arg { + name: "ref" + description: "A reference to the variable tensor." 
+ type_attr: "dtype" + is_ref: true + } attr { name: "shape" type: "shape" diff --git a/tensorflow/contrib/metrics/ops/set_ops.cc b/tensorflow/core/ops/set_ops.cc similarity index 100% rename from tensorflow/contrib/metrics/ops/set_ops.cc rename to tensorflow/core/ops/set_ops.cc diff --git a/tensorflow/contrib/metrics/ops/set_ops_test.cc b/tensorflow/core/ops/set_ops_test.cc similarity index 100% rename from tensorflow/contrib/metrics/ops/set_ops_test.cc rename to tensorflow/core/ops/set_ops_test.cc diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc index 4c9a41c44868bd..8370e57b88858b 100644 --- a/tensorflow/core/ops/state_ops.cc +++ b/tensorflow/core/ops/state_ops.cc @@ -22,6 +22,38 @@ using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; +REGISTER_OP("VariableV2") + .Output("ref: Ref(dtype)") + .Attr("shape: shape") + .Attr("dtype: type") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + TensorShapeProto shape_proto; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto)); + ShapeHandle output_shape; + TF_RETURN_IF_ERROR( + c->MakeShapeFromShapeProto(shape_proto, &output_shape)); + c->set_output(0, output_shape); + return Status::OK(); + }) + .Doc(R"doc( +Holds state in the form of a tensor that persists across steps. + +Outputs a ref to the tensor state so it may be read or modified. +TODO(zhifengc/mrry): Adds a pointer to a more detail document +about sharing states in tensorflow. + +ref: A reference to the variable tensor. +shape: The shape of the variable tensor. +dtype: The type of elements in the variable tensor. +container: If non-empty, this variable is placed in the given container. + Otherwise, a default container is used. +shared_name: If non-empty, this variable is named in the given bucket + with this shared_name. Otherwise, the node name is used instead. +)doc"); + REGISTER_OP("Variable") .Output("ref: Ref(dtype)") .Attr("shape: shape") @@ -55,7 +87,8 @@ TODO(zhifengc/mrry): Adds a pointer to a more detail document about sharing states in tensorflow. ref: A reference to the variable tensor. -shape: The shape of the variable tensor. +shape: The shape of the variable tensor, where scalar shapes are + treated as undefined. dtype: The type of elements in the variable tensor. container: If non-empty, this variable is placed in the given container. Otherwise, a default container is used. diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc index 4c1ec67e9cf24c..bcc1c924937ec0 100644 --- a/tensorflow/core/ops/state_ops_test.cc +++ b/tensorflow/core/ops/state_ops_test.cc @@ -97,4 +97,29 @@ TEST(StateOpsTest, Variable_ShapeFn) { .Finalize(&op.node_def)); INFER_OK(op, "", "[1,2,3]"); } + +TEST(StateOpsTest, VariableV2_ShapeFn) { + ShapeInferenceTestOp op("VariableV2"); + TensorShapeProto shape_proto; + + // Unknown rank. + shape_proto.set_unknown_rank(true); + TF_ASSERT_OK(NodeDefBuilder("test", "VariableV2") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "?"); + + // Scalar shape. + TF_ASSERT_OK(NodeDefBuilder("test", "VariableV2") + .Attr("shape", TensorShape({})) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[]"); + + // Specified shape. 
+ TensorShape({1, 2, 3}).AsProto(&shape_proto); + TF_ASSERT_OK(NodeDefBuilder("test", "VariableV2") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[1,2,3]"); +} } // end namespace tensorflow diff --git a/tensorflow/core/platform/cuda_libdevice_path.cc b/tensorflow/core/platform/cuda_libdevice_path.cc new file mode 100644 index 00000000000000..4d6532b983d52e --- /dev/null +++ b/tensorflow/core/platform/cuda_libdevice_path.cc @@ -0,0 +1,26 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cuda_libdevice_path.h" + +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { + +string LibdeviceRoot() { + return tensorflow::io::JoinPath(tensorflow::CudaRoot(), "nvvm/libdevice"); +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/cuda_libdevice_path.h b/tensorflow/core/platform/cuda_libdevice_path.h new file mode 100644 index 00000000000000..601d0db6d47c7f --- /dev/null +++ b/tensorflow/core/platform/cuda_libdevice_path.h @@ -0,0 +1,32 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_ + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Returns the root directory of the CUDA SDK, which contains sub-folders such +// as bin, lib64, and nvvm. +string CudaRoot(); + +// Returns the directory that contains nvvm libdevice files in the CUDA SDK. +string LibdeviceRoot(); + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_ diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc new file mode 100644 index 00000000000000..86295592a8b639 --- /dev/null +++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cuda_libdevice_path.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +#if GOOGLE_CUDA +TEST(CudaLibdevicePathTest, LibdevicePath) { + VLOG(2) << "Libdevice root = " << LibdeviceRoot(); + std::vector<string> libdevice_files; + TF_EXPECT_OK(Env::Default()->GetMatchingPaths( + io::JoinPath(LibdeviceRoot(), "libdevice.compute_*.bc"), + &libdevice_files)); + EXPECT_LT(0, libdevice_files.size()); +} +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index ed504e9db2bd1c..83a2a17d482c6a 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -148,6 +148,15 @@ def tf_additional_stream_executor_srcs(): def tf_additional_cupti_wrapper_deps(): return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"] +def tf_additional_libdevice_data(): + return ["@local_config_cuda//cuda:libdevice_root"] + +def tf_additional_libdevice_deps(): + return [] + +def tf_additional_libdevice_srcs(): + return ["platform/default/cuda_libdevice_path.cc"] + def tf_additional_test_deps(): return [] diff --git a/tensorflow/core/platform/default/cuda_libdevice_path.cc b/tensorflow/core/platform/default/cuda_libdevice_path.cc new file mode 100644 index 00000000000000..38fc0aba96aea4 --- /dev/null +++ b/tensorflow/core/platform/default/cuda_libdevice_path.cc @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cuda_libdevice_path.h" + +#include <stdlib.h> + +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/default/logging.h" + +namespace tensorflow { + +string CudaRoot() { + // 'bazel test' sets TEST_SRCDIR.
+ const string kRelativeCudaRoot = "local_config_cuda/cuda"; + const char* env = getenv("TEST_SRCDIR"); + if (env && env[0] != '\0') { + return strings::StrCat(env, "/", kRelativeCudaRoot); + } else { + LOG(WARNING) << "TEST_SRCDIR environment variable not set: " + << "using $PWD/" << kRelativeCudaRoot << "as the CUDA root."; + return kRelativeCudaRoot; + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/monitoring.cc b/tensorflow/core/platform/default/monitoring.cc new file mode 100644 index 00000000000000..267989031d3f20 --- /dev/null +++ b/tensorflow/core/platform/default/monitoring.cc @@ -0,0 +1,24 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/monitoring.h" + +namespace tensorflow { +namespace monitoring { + +void StartExporter() {} + +} // namespace monitoring +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/stream_executor.h b/tensorflow/core/platform/default/stream_executor.h index f96bf7dc94dd4f..3c178ffce0a2cd 100644 --- a/tensorflow/core/platform/default/stream_executor.h +++ b/tensorflow/core/platform/default/stream_executor.h @@ -20,10 +20,12 @@ limitations under the License. // IWYU pragma: friend third_party/tensorflow/core/platform/stream_executor.h #include "tensorflow/core/platform/default/from_stream_executor_status.h" +#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" diff --git a/tensorflow/core/platform/default/stream_executor_no_cuda.h b/tensorflow/core/platform/default/stream_executor_no_cuda.h index 445f63a9e7cb81..faa65256f9adb8 100644 --- a/tensorflow/core/platform/default/stream_executor_no_cuda.h +++ b/tensorflow/core/platform/default/stream_executor_no_cuda.h @@ -20,10 +20,12 @@ limitations under the License. 
// IWYU pragma: friend third_party/tensorflow/core/platform/stream_executor_no_cuda.h #include "tensorflow/core/platform/default/from_stream_executor_status.h" +#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index 787ebe654b6bd2..428a45576f85f8 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -208,12 +208,10 @@ class Env { // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or // provide a routine to get the absolute time. - /// \brief Returns the number of micro-seconds since some fixed point in - /// time. Only useful for computing deltas of time. + /// \brief Returns the number of micro-seconds since the Unix epoch. virtual uint64 NowMicros() = 0; - /// \brief Returns the number of seconds since some fixed point in - /// time. Only useful for computing deltas of time. + /// \brief Returns the number of seconds since the Unix epoch. virtual uint64 NowSeconds() { return NowMicros() / 1000000L; } /// Sleeps/delays the thread for the prescribed number of micro-seconds. diff --git a/tensorflow/core/platform/monitoring.h b/tensorflow/core/platform/monitoring.h new file mode 100644 index 00000000000000..743a2b93986a03 --- /dev/null +++ b/tensorflow/core/platform/monitoring.h @@ -0,0 +1,30 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_MONITORING_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_MONITORING_H_ + +namespace tensorflow { +namespace monitoring { + +// A hook to start periodically exporting metrics collected through our +// monitoring API. The TensorFlow runtime will call this the first time a new +// session is created using the NewSession method. +void StartExporter(); + +} // namespace monitoring +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_MONITORING_H_ diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index b85f501711cbab..e941b936291a0b 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -72,10 +72,11 @@ limitations under the License. // 16. Deprecate tensor_array (v1) ops in favor of v2 (10nov2016). // 17. Deprecate inv (11nov2016). // 17. Expose reverse_v2 (10nov2016) +// 18. 
Add VariableV2 (30nov2016) #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 17 +#define TF_GRAPH_DEF_VERSION 18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index f1885bb9809f14..c5cef06b1ad20f 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -606,11 +606,11 @@ Status FastParseSerializedExample( for (size_t d = 0; d < config.dense.size(); ++d) { if (dense_feature_last_example[d] == example_index) continue; if (config.dense[d].default_value.NumElements() == 0) { - return errors::InvalidArgument("Name: ", example_name, ", Feature: ", - config.dense[d].feature_name, - " is required but could not be found."); + return errors::InvalidArgument( + "Name: ", example_name, ", Feature: ", config.dense[d].feature_name, + " (data type: ", DataTypeString(config.dense[d].dtype), ")", + " is required but could not be found."); } - const Tensor& in = config.dense[d].default_value; Tensor& out = (*output_dense)[d]; const std::size_t num_elements = in.shape().num_elements(); diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc index 1a0f10b8305ce3..e077e94cf879ce 100644 --- a/tensorflow/core/util/memmapped_file_system.cc +++ b/tensorflow/core/util/memmapped_file_system.cc @@ -177,8 +177,13 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const { return reinterpret_cast(mapped_memory_->data()) + offset; } +#if defined(COMPILER_MSVC) constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix; constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef; +#else +constexpr char MemmappedFileSystem::kMemmappedPackagePrefix[]; +constexpr char MemmappedFileSystem::kMemmappedPackageDefaultGraphDef[]; +#endif Status MemmappedFileSystem::InitializeFromFile(Env* env, const string& filename) { diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h index 6d5533b53d8b0f..541587aeab0524 100644 --- a/tensorflow/core/util/memmapped_file_system.h +++ b/tensorflow/core/util/memmapped_file_system.h @@ -53,9 +53,19 @@ class MemmappedFileSystem : public FileSystem { public: // Memmapped regions use this prefix to distinguish from // the filesystem. - static constexpr char* kMemmappedPackagePrefix = "memmapped_package://"; - // The default graphdef in the package. +#if defined(COMPILER_MSVC) + static constexpr char* kMemmappedPackagePrefix = +#else + static constexpr char kMemmappedPackagePrefix[] = +#endif + "memmapped_package://"; + +// The default graphdef in the package. 
+#if defined(COMPILER_MSVC) static constexpr char* kMemmappedPackageDefaultGraphDef = +#else + static constexpr char kMemmappedPackageDefaultGraphDef[] = +#endif "memmapped_package://."; MemmappedFileSystem(); diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java index e9e370699d0763..52d493d0f4df0c 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java @@ -32,6 +32,7 @@ import android.widget.Toast; import java.nio.ByteBuffer; import org.tensorflow.demo.env.Logger; +import org.tensorflow.demo.R; public abstract class CameraActivity extends Activity implements OnImageAvailableListener { private static final Logger LOGGER = new Logger(); @@ -199,7 +200,7 @@ public void requestRender() { } } - public void addCallback(OverlayView.DrawCallback callback) { + public void addCallback(final OverlayView.DrawCallback callback) { final OverlayView overlay = (OverlayView) findViewById(R.id.overlay); if (overlay != null) { overlay.addCallback(callback); diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java index 2e09e78b8a4168..85df16c3c3c990 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java @@ -58,6 +58,7 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import org.tensorflow.demo.env.Logger; +import org.tensorflow.demo.R; public class CameraConnectionFragment extends Fragment { private static final Logger LOGGER = new Logger(); diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java index 3e3a3407fbe713..5985010a4ff74a 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java @@ -37,6 +37,7 @@ import org.tensorflow.demo.env.BorderedText; import org.tensorflow.demo.env.ImageUtils; import org.tensorflow.demo.env.Logger; +import org.tensorflow.demo.R; public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener { private static final Logger LOGGER = new Logger(); @@ -224,7 +225,7 @@ public void run() { Trace.endSection(); } - private void renderDebug(Canvas canvas) { + private void renderDebug(final Canvas canvas) { if (!isDebug()) { return; } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java index d75136485a825d..309cbbdee64ac1 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java @@ -42,6 +42,7 @@ import org.tensorflow.demo.env.ImageUtils; import org.tensorflow.demo.env.Logger; import org.tensorflow.demo.tracking.MultiBoxTracker; +import org.tensorflow.demo.R; /** * An activity that uses a TensorFlowMultiboxDetector and ObjectTracker to detect and then track diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index a26cdefda87387..9dae03029acecf 100644 --- 
a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -63,7 +63,7 @@ def rnn_model(features, target): word_list = tf.unstack(word_vectors, axis=1) # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE. - cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE) + cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE) # Create an unrolled Recurrent Neural Networks to length of # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit. diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py index 485d7592c45d80..d16b67c0bc58f8 100644 --- a/tensorflow/examples/learn/text_classification_character_rnn.py +++ b/tensorflow/examples/learn/text_classification_character_rnn.py @@ -50,7 +50,7 @@ def char_rnn_model(features, target): byte_list = tf.ont_hot(features, 256, 1, 0) byte_list = tf.unstack(byte_list, axis=1) - cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE) + cell = tf.contrib.rnn.GRUCell(HIDDEN_SIZE) _, encoding = tf.nn.rnn(cell, byte_list, dtype=tf.float32) logits = tf.contrib.layers.fully_connected(encoding, 15, activation_fn=None) diff --git a/tensorflow/examples/tutorials/input_fn/boston.py b/tensorflow/examples/tutorials/input_fn/boston.py index 13914ea1c4e008..fb2164c3952d0e 100644 --- a/tensorflow/examples/tutorials/input_fn/boston.py +++ b/tensorflow/examples/tutorials/input_fn/boston.py @@ -17,6 +17,8 @@ from __future__ import division from __future__ import print_function +import itertools + import pandas as pd import tensorflow as tf @@ -64,7 +66,9 @@ def main(unused_argv): # Print out predictions y = regressor.predict(input_fn=lambda: input_fn(prediction_set)) - print("Predictions: {}".format(str(y))) + # .predict() returns an iterator; convert to a list and print predictions + predictions = list(itertools.islice(y, 6)) + print("Predictions: {}".format(str(predictions))) if __name__ == "__main__": tf.app.run() diff --git a/tensorflow/examples/udacity/6_lstm.ipynb b/tensorflow/examples/udacity/6_lstm.ipynb index 6a9a5be96470ea..a2550f07b65e32 100644 --- a/tensorflow/examples/udacity/6_lstm.ipynb +++ b/tensorflow/examples/udacity/6_lstm.ipynb @@ -167,10 +167,10 @@ }, "source": [ "def read_data(filename):\n", - " f = zipfile.ZipFile(filename)\n", - " for name in f.namelist():\n", - " return tf.compat.as_str(f.read(name))\n", - " f.close()\n", + " with zipfile.ZipFile(filename) as f:\n", + " name = f.namelist()[0]\n", + " data = tf.compat.as_str(f.read(name))\n", + " return data\n", " \n", "text = read_data(filename)\n", "print('Data size %d' % len(text))" diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md index adbd947a9f977c..c5ab968a4acae0 100644 --- a/tensorflow/g3doc/api_docs/python/array_ops.md +++ b/tensorflow/g3doc/api_docs/python/array_ops.md @@ -441,7 +441,7 @@ For example: ```prettyprint # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] shape(squeeze(t)) ==> [2, 3] - ``` +``` Or, to remove specific size 1 dimensions: @@ -635,7 +635,7 @@ tf.slice(input, [1, 0, 0], [2, 1, 3]) ==> [[[3, 3, 3]], - - - -### `tf.strided_slice(input_, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, var=None, name=None)` {#strided_slice} +### `tf.strided_slice(input_, begin, end, strides=None, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, var=None, name=None)` {#strided_slice} Extracts a strided slice from a tensor. 
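A minimal sketch of the `tf.strided_slice` signature change documented above, reusing the tensor from the `tf.slice` example in these docs; treating an omitted `strides` as all-ones strides is an assumption about the new `strides=None` default rather than something stated in this patch:

```python
import tensorflow as tf

# Same 3-D tensor used by the tf.slice example in the docs above.
t = tf.constant([[[1, 1, 1], [2, 2, 2]],
                 [[3, 3, 3], [4, 4, 4]],
                 [[5, 5, 5], [6, 6, 6]]])

# Explicit strides, matching the documented result [[[3, 3, 3]]].
a = tf.strided_slice(t, [1, 0, 0], [2, 1, 3], [1, 1, 1])

# With the new signature strides may be omitted; assumed here to behave
# like a stride of one in every dimension, so this should equal `a`.
b = tf.strided_slice(t, [1, 0, 0], [2, 1, 3])

with tf.Session() as sess:
    print(sess.run(a))
    print(sess.run(b))
```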
@@ -724,12 +724,14 @@ tf.strided_slice(input, [1, 1, 0], [2, -1, 3], [1, -1, 1]) ==>[[[4, 4, 4], - - - -### `tf.split(split_dim, num_split, value, name='split')` {#split} +### `tf.split(axis=None, num_or_size_splits=None, value=None, name='split', split_dim=None)` {#split} + +DEPRECATED: use split_v; split_v rename to split happening soon. Splits a tensor into `num_split` tensors along one dimension. -Splits `value` along dimension `split_dim` into `num_split` smaller tensors. -Requires that `num_split` evenly divide `value.shape[split_dim]`. +Splits `value` along dimension `axis` into `num_or_size_splits` smaller +tensors. Requires that `num_or_size_splits` evenly divide `value.shape[axis]`. For example: @@ -757,11 +759,13 @@ tf.unpack(t, axis=axis) ##### Args: -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. -* `num_split`: A Python integer. The number of ways to split. +* `num_or_size_splits`: A Python integer. The number of ways to split. Has a + different meaning in split_v (see docs). * `value`: The `Tensor` to split. * `name`: A name for the operation (optional). +* `split_dim`: The old (deprecated) name for axis. ##### Returns: @@ -770,15 +774,15 @@ tf.unpack(t, axis=axis) - - - -### `tf.split_v(value, size_splits, split_dim=0, num=None, name='split_v')` {#split_v} +### `tf.split_v(value=None, num_or_size_splits=None, axis=0, num=None, name='split_v')` {#split_v} Splits a tensor into sub tensors. -If size_splits is a scalar, `num_split`, then -splits `value` along dimension `split_dim` into `num_split` smaller tensors. +If num_or_size_splits is a scalar, `num_split`, then +splits `value` along dimension `axis` into `num_split` smaller tensors. Requires that `num_split` evenly divide `value.shape[split_dim]`. -If size_splits is a tensor, then +If num_or_size_splits is a tensor, then splits `value` into len(size_splits) pieces each the same size as the input except along dimension split_dim where the size is size_splits[i]. @@ -800,12 +804,12 @@ tf.shape(split0) ==> [5, 10] * `value`: The `Tensor` to split. -* `size_splits`: Either an integer indicating the number of splits along +* `num_or_size_splits`: Either an integer indicating the number of splits along split_dim or a 1-D Tensor containing the sizes of each output tensor along split_dim. If an integer then it must evenly divide value.shape[split_dim]; otherwise the sum of sizes along the split dimension must match that of the input. -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. Defaults to 0. * `num`: Optional, used to specify the number of outputs when it cannot be inferred from the shape of size_splits. @@ -813,7 +817,10 @@ tf.shape(split0) ==> [5, 10] ##### Returns: - `len(size_splits)` `Tensor` objects resulting from splitting `value`. + if `num_or_size_splits` is a scalar returns `num_or_size_splits` `Tensor` + objects; if `num_or_size_splits` is a 1-D Tensor returns + `num_or_size_splits.get_shape[0]` `Tensor` objects resulting from splitting + `value`. ##### Raises: @@ -908,9 +915,13 @@ pad(t, paddings, "SYMMETRIC") ==> [[2, 1, 1, 2, 3, 3, 2], - - - -### `tf.concat(concat_dim, values, name='concat')` {#concat} +### `tf.concat(*args, **kwargs)` {#concat} -Concatenates tensors along one dimension. +Concatenates tensors along one dimension. 
(deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-13. +Instructions for updating: +This op will be removed after the deprecation date. Please switch to tf.concat_v2(). Concatenates the list of tensors `values` along dimension `concat_dim`. If `values[i].shape = [D0, D1, ... Dconcat_dim(i), ...Dn]`, the concatenated @@ -1216,20 +1227,20 @@ This is the opposite of pack. The numpy equivalent is - - - -### `tf.reverse_sequence(input, seq_lengths, seq_dim, batch_dim=None, name=None)` {#reverse_sequence} +### `tf.reverse_sequence(input, seq_lengths, seq_axis=None, batch_axis=None, name=None, seq_dim=None, batch_dim=None)` {#reverse_sequence} Reverses variable length slices. -This op first slices `input` along the dimension `batch_dim`, and for each +This op first slices `input` along the dimension `batch_axis`, and for each slice `i`, reverses the first `seq_lengths[i]` elements along -the dimension `seq_dim`. +the dimension `seq_axis`. The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`, and `seq_lengths` must be a vector of length `input.dims[batch_dim]`. -The output slice `i` along dimension `batch_dim` is then given by input +The output slice `i` along dimension `batch_axis` is then given by input slice `i`, with the first `seq_lengths[i]` slices along dimension -`seq_dim` reversed. +`seq_axis` reversed. For example: @@ -1282,8 +1293,8 @@ output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...] * `seq_lengths`: A `Tensor`. Must be one of the following types: `int32`, `int64`. 1-D with length `input.dims(batch_dim)` and `max(seq_lengths) < input.dims(seq_dim)` -* `seq_dim`: An `int`. The dimension which is partially reversed. -* `batch_dim`: An optional `int`. Defaults to `0`. +* `seq_axis`: An `int`. The dimension which is partially reversed. +* `batch_axis`: An optional `int`. Defaults to `0`. The dimension along which reversal is performed. * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/contrib.framework.md b/tensorflow/g3doc/api_docs/python/contrib.framework.md index 49892fdcaf88f8..a10cd9d09778d1 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.framework.md +++ b/tensorflow/g3doc/api_docs/python/contrib.framework.md @@ -59,7 +59,7 @@ Assert `tensor` is 0-D, of type `tf.int32` or `tf.int64`. - - - -### `tf.contrib.framework.convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None)` {#convert_to_tensor_or_sparse_tensor} +### `tf.convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None)` {#convert_to_tensor_or_sparse_tensor} Converts value to a `SparseTensor` or `Tensor`. diff --git a/tensorflow/g3doc/api_docs/python/contrib.layers.md b/tensorflow/g3doc/api_docs/python/contrib.layers.md index 3babfa0cabc92b..3a1be18d24c797 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.layers.md +++ b/tensorflow/g3doc/api_docs/python/contrib.layers.md @@ -1711,7 +1711,7 @@ Example: is a SparseTensor. Following are assumed to be true: * sparse_tensor.indices = weights_tensor.indices - * sparse_tensor.shape = weights_tensor.shape + * sparse_tensor.dense_shape = weights_tensor.dense_shape ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md index 1808bb94e2cca0..6c588547810800 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.learn.md +++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md @@ -67,11 +67,12 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) 
-> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. + `input_fn` or `feed_fn` is provided. + Or if `metrics` is not `None` or `dict`. - - - @@ -84,36 +85,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. 
+* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -132,8 +136,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -208,39 +213,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. +This is especially useful when the whole dataset is too big to +fit in memory at the same time. Or when model is taking long time +to converge, and you want to split up training into subparts. + +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. * `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. +##### Returns: -* `Returns`: - `self`, for chaining. + `self`, for chaining. + +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. 
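A minimal sketch of the `input_fn`-based calling convention these docs now emphasize over `x`/`y`; `LinearRegressor`, `real_valued_column`, the feature key `"x"`, and the toy data are illustrative assumptions, not part of this change:

```python
import numpy as np
import tensorflow as tf

# Toy data (y = 2x), used only to exercise the API.
xs = np.arange(10, dtype=np.float32).reshape(-1, 1)
ys = 2.0 * xs

def input_fn():
    # Returns (features, labels): a dict of Tensors plus a label Tensor,
    # as described in the fit()/evaluate() docs above.
    features = {"x": tf.constant(xs)}
    labels = tf.constant(ys)
    return features, labels

est = tf.contrib.learn.LinearRegressor(
    feature_columns=[tf.contrib.layers.real_valued_column("x")])

est.fit(input_fn=input_fn, steps=100)       # input_fn instead of x=/y=
print(est.evaluate(input_fn=input_fn, steps=1))
```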
- - - @@ -259,30 +266,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + +##### Returns: + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. @@ -408,11 +417,12 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. + `input_fn` or `feed_fn` is provided. + Or if `metrics` is not `None` or `dict`. - - - @@ -425,36 +435,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. 
Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -466,28 +479,33 @@ Exports inference graph as a SavedModel into given dir. (experimental) THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning. - Args: - export_dir_base: A string containing a directory to write the exported - graph and checkpoints. - input_fn: A function that takes no argument and - returns an `InputFnOps`. - default_output_alternative_key: the name of the head to serve when none is - specified. - assets_extra: A dict specifying how to populate the assets.extra directory - within the exported SavedModel. Each key should give the destination - path (including the filename) relative to the assets.extra directory. - The corresponding value gives the full path of the source file to be - copied. 
For example, the simple case of copying a single file without - renaming it is specified as - `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. - as_text: whether to write the SavedModel proto in text format. - exports_to_keep: Number of exports to keep. +##### Args: + + +* `export_dir_base`: A string containing a directory to write the exported + graph and checkpoints. +* `input_fn`: A function that takes no argument and + returns an `InputFnOps`. +* `default_output_alternative_key`: the name of the head to serve when none is + specified. +* `assets_extra`: A dict specifying how to populate the assets.extra directory + within the exported SavedModel. Each key should give the destination + path (including the filename) relative to the assets.extra directory. + The corresponding value gives the full path of the source file to be + copied. For example, the simple case of copying a single file without + renaming it is specified as + `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. +* `as_text`: whether to write the SavedModel proto in text format. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. - Returns: - The string path to the exported directory. +##### Raises: - Raises: - ValueError: if an unrecognized export_type is requested. + +* `ValueError`: if an unrecognized export_type is requested. - - - @@ -506,8 +524,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -582,39 +601,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. + +This is especially useful when the whole dataset is too big to +fit in memory at the same time. Or when model is taking long time +to converge, and you want to split up training into subparts. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. 
* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. +##### Returns: -* `Returns`: - `self`, for chaining. + `self`, for chaining. + +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. - - - @@ -633,30 +654,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). +##### Returns: + + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. @@ -744,7 +767,7 @@ Trains a model given training data `x` predictions and `y` labels. Interface for objects that are evaluatable by, e.g., `Experiment`. - - - -#### `tf.contrib.learn.Evaluable.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Evaluable.evaluate} +#### `tf.contrib.learn.Evaluable.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#Evaluable.evaluate} Evaluates given model with provided evaluation data. @@ -799,12 +822,21 @@ for which this evaluation was performed. * `name`: Name of the evaluation if user needs to run multiple evaluations on different data sets, such as on training data vs test data. +* `checkpoint_path`: Path of a specific checkpoint to evaluate. If `None`, the + latest checkpoint in `model_dir` is used. ##### Returns: Returns `dict` with evaluation results. +- - - + +#### `tf.contrib.learn.Evaluable.model_dir` {#Evaluable.model_dir} + +Returns a path in which the eval process will look for checkpoints. 
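
The `checkpoint_path` argument and `model_dir` property introduced above let callers evaluate a pinned checkpoint instead of whatever happens to be the latest one in the model directory. A minimal sketch of the intended call pattern, assuming a hypothetical `LinearRegressor` already trained into `/tmp/my_model` — the estimator class, feature column, paths and step counts below are illustrative assumptions, not part of this patch:

```python
import tensorflow as tf

feature_columns = [tf.contrib.layers.real_valued_column("x")]
estimator = tf.contrib.learn.LinearRegressor(
    feature_columns=feature_columns, model_dir="/tmp/my_model")

def eval_input_fn():
  features = {"x": tf.constant([[1.], [2.], [3.]])}
  labels = tf.constant([[2.], [4.], [6.]])
  return features, labels

# Default behavior: evaluate the latest checkpoint found in estimator.model_dir.
latest_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)

# New behavior: pin the evaluation to one specific checkpoint file.
pinned_metrics = estimator.evaluate(
    input_fn=eval_input_fn, steps=1,
    checkpoint_path="/tmp/my_model/model.ckpt-1000")  # hypothetical checkpoint
```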
+ + - - - @@ -949,7 +981,7 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate} +#### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#DNNClassifier.evaluate} See evaluable.Evaluable. Note: Labels must be integer class indices. @@ -1023,19 +1055,22 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - Args: - x: features. - input_fn: Input function. If set, x must be None. - batch_size: Override default batch size. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). +##### Args: - Returns: - Numpy array of predicted classes (or an iterable of predicted classes if - as_iterable is True). Each predicted class is represented by its class - index (i.e. integer from 0 to n_classes-1). + +* `x`: features. +* `input_fn`: Input function. If set, x must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + +##### Returns: + + Numpy array of predicted classes with shape [batch_size] (or an iterable + of predicted classes if as_iterable is True). Each predicted class is + represented by its class index (i.e. integer from 0 to n_classes-1). - - - @@ -1050,19 +1085,21 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - Args: - x: features. - input_fn: Input function. If set, x and y must be None. - batch_size: Override default batch size. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). +##### Args: - Returns: - Numpy array of predicted probabilities (or an iterable of predicted - probabilities if as_iterable is True). Each predicted class is represented - by its class index (i.e. integer from 0 to n_classes-1). + +* `x`: features. +* `input_fn`: Input function. If set, x and y must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + +##### Returns: + + Numpy array of predicted probabilities with shape [batch_size, n_classes] + (or an iterable of predicted probabilities if as_iterable is True). 
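
The clarified output shapes above (`[batch_size]` for predicted class indices, `[batch_size, n_classes]` for probabilities) can be seen in a small sketch; the layer sizes, random data, and use of `infer_real_valued_columns_from_input` are illustrative assumptions rather than content of this patch:

```python
import numpy as np
import tensorflow as tf

# Synthetic 3-class problem with 4 features and a batch of 8 examples.
x = np.random.rand(8, 4).astype(np.float32)
y = np.random.randint(0, 3, size=8)

feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x)
classifier = tf.contrib.learn.DNNClassifier(
    hidden_units=[10, 10], feature_columns=feature_columns, n_classes=3)
classifier.fit(x=x, y=y, steps=10)

classes = classifier.predict(x=x, as_iterable=False)      # shape [8]
probs = classifier.predict_proba(x=x, as_iterable=False)  # shape [8, 3]
```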
- - - @@ -1134,7 +1171,7 @@ Input of `fit` and `evaluate` should have following features, whose `value` is a `Tensor`. - - - -#### `tf.contrib.learn.DNNRegressor.__init__(hidden_units, feature_columns, model_dir=None, weight_column_name=None, optimizer=None, activation_fn=relu, dropout=None, gradient_clip_norm=None, enable_centered_bias=False, config=None, feature_engineering_fn=None, label_dimension=1)` {#DNNRegressor.__init__} +#### `tf.contrib.learn.DNNRegressor.__init__(hidden_units, feature_columns, model_dir=None, weight_column_name=None, optimizer=None, activation_fn=relu, dropout=None, gradient_clip_norm=None, enable_centered_bias=False, config=None, feature_engineering_fn=None, label_dimension=1, embedding_lr_multipliers=None)` {#DNNRegressor.__init__} Initializes a `DNNRegressor` instance. @@ -1171,30 +1208,15 @@ Initializes a `DNNRegressor` instance. returns features and labels which will be fed into the model. * `label_dimension`: Dimension of the label for multilabels. Defaults to 1. +* `embedding_lr_multipliers`: Optional. A dictionary from `EbeddingColumn` to + a `float` multiplier. Multiplier will be used to multiply with + learning rate for the embedding variables. ##### Returns: A `DNNRegressor` estimator. -- - - - -#### `tf.contrib.learn.DNNRegressor.__repr__()` {#DNNRegressor.__repr__} - - - - -- - - - -#### `tf.contrib.learn.DNNRegressor.bias_` {#DNNRegressor.bias_} - -DEPRECATED FUNCTION - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - - - - #### `tf.contrib.learn.DNNRegressor.config` {#DNNRegressor.config} @@ -1204,96 +1226,23 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.DNNRegressor.dnn_bias_` {#DNNRegressor.dnn_bias_} - -Returns bias of deep neural network part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.dnn_weights_` {#DNNRegressor.dnn_weights_} - -Returns weights of deep neural network part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.evaluate(*args, **kwargs)` {#DNNRegressor.evaluate} - -See `Evaluable`. (deprecated arguments) - -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. -Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. - -##### Example conversion: - - est = Estimator(...) -> est = SKCompat(Estimator(...)) - - -* `Raises`: -* `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. 
- - -- - - - -#### `tf.contrib.learn.DNNRegressor.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=None, exports_to_keep=None)` {#DNNRegressor.export} - +#### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#DNNRegressor.evaluate} +See evaluable.Evaluable. - - - -#### `tf.contrib.learn.DNNRegressor.fit(*args, **kwargs)` {#DNNRegressor.fit} - -See `Trainable`. (deprecated arguments) +#### `tf.contrib.learn.DNNRegressor.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=1, exports_to_keep=None)` {#DNNRegressor.export} -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. -Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. - -##### Example conversion: - - est = Estimator(...) -> est = SKCompat(Estimator(...)) - - -* `Raises`: -* `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. -* `ValueError`: If both `steps` and `max_steps` are not `None`. +See BaseEstimator.export. - - - -#### `tf.contrib.learn.DNNRegressor.get_params(deep=True)` {#DNNRegressor.get_params} - -Get parameters for this estimator. +#### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit} -##### Args: - - -* `deep`: boolean, optional - - If `True`, will return the parameters for this estimator and - contained subobjects that are estimators. - -##### Returns: - - params : mapping of string to any - Parameter names mapped to their values. +See trainable.Trainable. - - - @@ -1320,29 +1269,7 @@ Returns value of the variable given by name. ##### Returns: - Numpy array - value of the tensor. - - -- - - - -#### `tf.contrib.learn.DNNRegressor.linear_bias_` {#DNNRegressor.linear_bias_} - -Returns bias of the linear part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.linear_weights_` {#DNNRegressor.linear_weights_} - -Returns weights per feature of the linear part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). + `Tensor` object. - - - @@ -1352,62 +1279,11 @@ This method will be removed after the deprecation date. To inspect variables, us -- - - - -#### `tf.contrib.learn.DNNRegressor.partial_fit(*args, **kwargs)` {#DNNRegressor.partial_fit} - -Incremental fit on a batch of samples. (deprecated arguments) - -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. -Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. - -##### Example conversion: - - est = Estimator(...) 
-> est = SKCompat(Estimator(...)) - - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. - - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. - - -* `Args`: -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. - - -* `Returns`: - `self`, for chaining. - - -* `Raises`: -* `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. - - - - - #### `tf.contrib.learn.DNNRegressor.predict(*args, **kwargs)` {#DNNRegressor.predict} -Runs inference to determine the predicted class. (deprecated arguments) +Returns predicted scores for given features. (deprecated arguments) SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-15. Instructions for updating: @@ -1415,42 +1291,22 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - -- - - - -#### `tf.contrib.learn.DNNRegressor.set_params(**params)` {#DNNRegressor.set_params} - -Set the parameters of this estimator. - -The method works on simple estimators as well as on nested objects -(such as pipelines). The former have parameters of the form -``__`` so that it's possible to update each -component of a nested object. - ##### Args: -* `**params`: Parameters. +* `x`: features. +* `input_fn`: Input function. If set, x must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). ##### Returns: - self - -##### Raises: - - -* `ValueError`: If params contain invalid names. - - -- - - - -#### `tf.contrib.learn.DNNRegressor.weights_` {#DNNRegressor.weights_} - -DEPRECATED FUNCTION - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). + Numpy array of predicted scores (or an iterable of predicted scores if + as_iterable is True). If `label_dimension == 1`, the shape of the output + is `[batch_size]`, otherwise the shape is `[batch_size, label_dimension]`. @@ -1587,7 +1443,7 @@ This method will be removed after the deprecation date. 
To inspect variables, us - - - -#### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate} +#### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LinearClassifier.evaluate} See evaluable.Evaluable. Note: Labels must be integer class indices. @@ -1786,7 +1642,7 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate} +#### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LinearRegressor.evaluate} See evaluable.Evaluable. @@ -1924,7 +1780,7 @@ Initializes a LogisticRegressor. - - - -#### `tf.contrib.learn.LogisticRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LogisticRegressor.evaluate} +#### `tf.contrib.learn.LogisticRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LogisticRegressor.evaluate} Evaluates given model with provided evaluation data. @@ -1941,6 +1797,8 @@ See superclass Estimator for more details. * `steps`: Number of steps for which to evaluate model. * `metrics`: Dict of metric ops to run. If None, the default metrics are used. * `name`: Name of the evaluation. +* `checkpoint_path`: A specific checkpoint to use. By default, use the latest + checkpoint in the `model_dir`. ##### Returns: @@ -1957,36 +1815,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. 
If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -1998,28 +1859,33 @@ Exports inference graph as a SavedModel into given dir. (experimental) THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning. - Args: - export_dir_base: A string containing a directory to write the exported - graph and checkpoints. - input_fn: A function that takes no argument and - returns an `InputFnOps`. - default_output_alternative_key: the name of the head to serve when none is - specified. - assets_extra: A dict specifying how to populate the assets.extra directory - within the exported SavedModel. Each key should give the destination - path (including the filename) relative to the assets.extra directory. - The corresponding value gives the full path of the source file to be - copied. For example, the simple case of copying a single file without - renaming it is specified as - `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. - as_text: whether to write the SavedModel proto in text format. - exports_to_keep: Number of exports to keep. +##### Args: + + +* `export_dir_base`: A string containing a directory to write the exported + graph and checkpoints. +* `input_fn`: A function that takes no argument and + returns an `InputFnOps`. +* `default_output_alternative_key`: the name of the head to serve when none is + specified. 
+* `assets_extra`: A dict specifying how to populate the assets.extra directory + within the exported SavedModel. Each key should give the destination + path (including the filename) relative to the assets.extra directory. + The corresponding value gives the full path of the source file to be + copied. For example, the simple case of copying a single file without + renaming it is specified as + `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. +* `as_text`: whether to write the SavedModel proto in text format. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. + +##### Raises: - Returns: - The string path to the exported directory. - Raises: - ValueError: if an unrecognized export_type is requested. +* `ValueError`: if an unrecognized export_type is requested. - - - @@ -2038,8 +1904,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -2131,39 +1998,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. +This is especially useful when the whole dataset is too big to +fit in memory at the same time. Or when model is taking long time +to converge, and you want to split up training into subparts. + +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. * `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. + +##### Returns: + `self`, for chaining. -* `Returns`: - `self`, for chaining. +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. 
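
Because `partial_fit` resumes from the model state left by the previous call, streaming chunks that individually fit in memory is enough for the out-of-core training described above. A rough sketch under assumptions not taken from this patch — a hypothetical `LinearClassifier`, synthetic chunks, and illustrative sizes:

```python
import numpy as np
import tensorflow as tf

# Infer columns from one sample chunk of the (hypothetical) 2-feature data.
x0 = np.random.rand(128, 2).astype(np.float32)
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x0)
clf = tf.contrib.learn.LinearClassifier(feature_columns=feature_columns)

# Each call continues training from the previous state, so the full dataset
# never has to be held in memory at once.
for _ in range(5):
  x_chunk = np.random.rand(128, 2).astype(np.float32)
  y_chunk = np.random.randint(0, 2, size=128)
  clf.partial_fit(x=x_chunk, y=y_chunk, steps=10)
```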
- - - @@ -2182,30 +2051,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + +##### Returns: + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md index d4d200399ef231..096bdba0fd5564 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md +++ b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md @@ -878,27 +878,31 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will both become required args. - Args: - every_n_steps: Run monitor every N steps. - export_dir: str, folder to export. - input_fn: A function that takes no argument and returns a tuple of - (features, labels), where features is a dict of string key to `Tensor` - and labels is a `Tensor` that's currently not used (and so can be - `None`). - input_feature_key: String key into the features dict returned by - `input_fn` that corresponds to the raw `Example` strings `Tensor` that - the exported model will take as input. Should be `None` if and only if - you're passing in a `signature_fn` that does not use the first arg - (`Tensor` of `Example` strings). - exports_to_keep: int, number of exports to keep. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `dict` of `Tensor`s for predictions. - default_batch_size: Default batch size of the `Example` placeholder. 
- - Raises: - ValueError: If `input_fn` and `input_feature_key` are not both defined or - are not both `None`. +##### Args: + + +* `every_n_steps`: Run monitor every N steps. +* `export_dir`: str, folder to export. +* `input_fn`: A function that takes no argument and returns a tuple of + (features, labels), where features is a dict of string key to `Tensor` + and labels is a `Tensor` that's currently not used (and so can be + `None`). +* `input_feature_key`: String key into the features dict returned by + `input_fn` that corresponds to the raw `Example` strings `Tensor` that + the exported model will take as input. Should be `None` if and only if + you're passing in a `signature_fn` that does not use the first arg + (`Tensor` of `Example` strings). +* `exports_to_keep`: int, number of exports to keep. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `dict` of `Tensor`s for predictions. +* `default_batch_size`: Default batch size of the `Example` placeholder. + +##### Raises: + + +* `ValueError`: If `input_fn` and `input_feature_key` are not both defined or + are not both `None`. - - - @@ -2653,6 +2657,37 @@ Wraps monitors into a SessionRunHook. +- - - + +### `class tf.contrib.learn.monitors.SummaryWriterCache` {#SummaryWriterCache} + +Cache for file writers. + +This class caches file writers, one per directory. +- - - + +#### `tf.contrib.learn.monitors.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} + +Clear cached summary writers. Currently only used for unit tests. + + +- - - + +#### `tf.contrib.learn.monitors.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} + +Returns the FileWriter for the specified directory. + +##### Args: + + +* `logdir`: str, name of the directory. + +##### Returns: + + A `FileWriter`. + + + - - - ### `tf.contrib.learn.monitors.replace_monitors_with_hooks(monitors_or_hooks, estimator)` {#replace_monitors_with_hooks} diff --git a/tensorflow/g3doc/api_docs/python/contrib.legacy_seq2seq.md b/tensorflow/g3doc/api_docs/python/contrib.legacy_seq2seq.md new file mode 100644 index 00000000000000..2350982acfa2f6 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/contrib.legacy_seq2seq.md @@ -0,0 +1,578 @@ + + +# Sequence to Sequence (contrib) +[TOC] + +Deprecated library for creating sequence-to-sequence models in TensorFlow. + +- - - + +### `tf.contrib.legacy_seq2seq.attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=None, scope=None, initial_state_attention=False)` {#attention_decoder} + +RNN decoder with attention for the sequence-to-sequence model. + +In this context "attention" means that, during decoding, the RNN can look up +information in the additional tensor attention_states, and it does this by +focusing on a few entries from the tensor. This model has proven to yield +especially good results in a number of sequence-to-sequence tasks. This +implementation is based on http://arxiv.org/abs/1412.7449 (see below for +details). It is recommended for complex sequence-to-sequence tasks. + +##### Args: + + +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `attention_states`: 3D Tensor [batch_size x attn_length x attn_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `output_size`: Size of the output vectors; if None, we use cell.output_size. 
+* `num_heads`: Number of attention heads that read from attention_states. +* `loop_function`: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. +* `dtype`: The dtype to use for the RNN initial state (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "attention_decoder". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors of + shape [batch_size x output_size]. These represent the generated outputs. + Output i is computed from input i (which is either the i-th element + of decoder_inputs or loop_function(output {i-1}, i)) as follows. + First, we run the cell on a combination of the input and previous + attention masks: + cell_output, new_state = cell(linear(input, prev_attn), prev_state). + Then, we calculate new attention masks: + new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) + and then we calculate the output: + output = linear(cell_output, new_attn). +* `state`: The state of each decoder cell the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: when num_heads is not positive, there are no inputs, shapes + of attention_states are not set, or input size cannot be inferred + from the input. + + +- - - + +### `tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, dtype=tf.float32, scope=None)` {#basic_rnn_seq2seq} + +Basic RNN sequence-to-sequence model. + +This model first runs an RNN to encode encoder_inputs into a state vector, +then runs decoder, initialized with the last encoder state, on decoder_inputs. +Encoder and decoder use the same RNN cell type, but don't share parameters. + +##### Args: + + +* `encoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `dtype`: The dtype of the initial state of the RNN cell (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell in the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. 
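
A short sketch of wiring up `basic_rnn_seq2seq` as documented above. The placeholder shapes, sequence lengths and cell size are illustrative, and the GRU cell is assumed to come from the `tf.nn.rnn_cell` module of this release:

```python
import tensorflow as tf

batch_size, input_size, num_units = 32, 8, 24
encoder_inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
                  for _ in range(10)]  # 10 encoder time-steps
decoder_inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
                  for _ in range(7)]   # 7 decoder time-steps
cell = tf.nn.rnn_cell.GRUCell(num_units)

outputs, state = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(
    encoder_inputs, decoder_inputs, cell)
# len(outputs) == 7; each output has shape [batch_size, num_units].
# state has shape [batch_size, cell.state_size] (num_units for a GRU cell).
```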
+ + +- - - + +### `tf.contrib.legacy_seq2seq.embedding_attention_decoder(decoder_inputs, initial_state, attention_states, cell, num_symbols, embedding_size, num_heads=1, output_size=None, output_projection=None, feed_previous=False, update_embedding_for_previous=True, dtype=None, scope=None, initial_state_attention=False)` {#embedding_attention_decoder} + +RNN decoder with embedding and attention and a pure-decoding option. + +##### Args: + + +* `decoder_inputs`: A list of 1D batch-sized int32 Tensors (decoder inputs). +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `attention_states`: 3D Tensor [batch_size x attn_length x attn_size]. +* `cell`: rnn_cell.RNNCell defining the cell function. +* `num_symbols`: Integer, how many symbols come into the embedding. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_heads`: Number of attention heads that read from attention_states. +* `output_size`: Size of the output vectors; if None, use output_size. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. +* `feed_previous`: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). +* `update_embedding_for_previous`: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. +* `dtype`: The dtype to use for the RNN initial states (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. + + +- - - + +### `tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False)` {#embedding_attention_seq2seq} + +Embedding sequence-to-sequence model with attention. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_encoder_symbols x input_size]). Then it runs an RNN to encode +embedded encoder_inputs into a state vector. It keeps the outputs of this +RNN at every step to use for attention later. 
Next, it embeds decoder_inputs +by another newly created embedding (of shape [num_decoder_symbols x +input_size]). Then it runs attention decoder, initialized with the last +encoder state, on embedded decoder_inputs and attending to encoder outputs. + +Warning: when output_projection is None, the size of the attention vectors +and variables will be made proportional to num_decoder_symbols, can be large. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols`: Integer; number of symbols on the decoder side. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_heads`: Number of attention heads that read from attention_states. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial RNN state (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + +- - - + +### `tf.contrib.legacy_seq2seq.embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, embedding_size, output_projection=None, feed_previous=False, update_embedding_for_previous=True, scope=None)` {#embedding_rnn_decoder} + +RNN decoder with embedding and a pure-decoding option. + +##### Args: + + +* `decoder_inputs`: A list of 1D batch-sized int32 Tensors (decoder inputs). +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `cell`: rnn_cell.RNNCell defining the cell function. +* `num_symbols`: Integer, how many symbols come into the embedding. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each fed + previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. 
+ If False, decoder_inputs are used as given (the standard decoder case). +* `update_embedding_for_previous`: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_rnn_decoder". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors. The + output is of shape [batch_size x cell.output_size] when + output_projection is not None (and represents the dense representation + of predicted tokens). It is of shape [batch_size x num_decoder_symbols] + when output_projection is None. +* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. + + +- - - + +### `tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=None, scope=None)` {#embedding_rnn_seq2seq} + +Embedding RNN sequence-to-sequence model. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_encoder_symbols x input_size]). Then it runs an RNN to encode +embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs +by another newly created embedding (of shape [num_decoder_symbols x +input_size]). Then it runs RNN decoder, initialized with the last +encoder state, on embedded decoder_inputs. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols`: Integer; number of symbols on the decoder side. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_rnn_seq2seq" + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors. The + output is of shape [batch_size x cell.output_size] when + output_projection is not None (and represents the dense representation + of predicted tokens). It is of shape [batch_size x num_decoder_symbols] + when output_projection is None. 
+* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + +- - - + +### `tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, embedding_size, num_decoder_symbols=None, output_projection=None, feed_previous=False, dtype=None, scope=None)` {#embedding_tied_rnn_seq2seq} + +Embedding RNN sequence-to-sequence model with tied (shared) parameters. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_symbols x input_size]). Then it runs an RNN to encode embedded +encoder_inputs into a state vector. Next, it embeds decoder_inputs using +the same embedding. Then it runs RNN decoder, initialized with the last +encoder state, on embedded decoder_inputs. The decoder output is over symbols +from 0 to num_decoder_symbols - 1 if num_decoder_symbols is none; otherwise it +is over 0 to num_symbols - 1. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_symbols`: Integer; number of symbols for both encoder and decoder. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_decoder_symbols`: Integer; number of output symbols for decoder. If + provided, the decoder output is over symbols 0 to num_decoder_symbols - 1. + Otherwise, decoder output is over symbols 0 to num_symbols - 1. Note that + this assumes that the vocabulary is set up such that the first + num_decoder_symbols of num_symbols are part of decoding. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype to use for the initial RNN states (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_tied_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_symbols] containing the generated + outputs where output_symbols = num_decoder_symbols if + num_decoder_symbols is not None otherwise output_symbols = num_symbols. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. + + +- - - + +### `tf.contrib.legacy_seq2seq.model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, buckets, seq2seq, softmax_loss_function=None, per_example_loss=False, name=None)` {#model_with_buckets} + +Create a sequence-to-sequence model with support for bucketing. 
+ +The seq2seq argument is a function that defines a sequence-to-sequence model, +e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) + +##### Args: + + +* `encoder_inputs`: A list of Tensors to feed the encoder; first seq2seq input. +* `decoder_inputs`: A list of Tensors to feed the decoder; second seq2seq input. +* `targets`: A list of 1D batch-sized int32 Tensors (desired output sequence). +* `weights`: List of 1D batch-sized float-Tensors to weight the targets. +* `buckets`: A list of pairs of (input size, output size) for each bucket. +* `seq2seq`: A sequence-to-sequence model function; it takes 2 input that + agree with encoder_inputs and decoder_inputs, and returns a pair + consisting of outputs and states (as, e.g., basic_rnn_seq2seq). +* `softmax_loss_function`: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). +* `per_example_loss`: Boolean. If set, the returned loss will be a batch-sized + tensor of losses for each sequence in the batch. If unset, it will be + a scalar with the averaged loss from all examples. +* `name`: Optional name for this operation, defaults to "model_with_buckets". + +##### Returns: + + A tuple of the form (outputs, losses), where: + +* `outputs`: The outputs for each bucket. Its j'th element consists of a list + of 2D Tensors. The shape of output tensors can be either + [batch_size x output_size] or [batch_size x num_decoder_symbols] + depending on the seq2seq model used. +* `losses`: List of scalar Tensors, representing losses for each bucket, or, + if per_example_loss is set, a list of 1D batch-sized float Tensors. + +##### Raises: + + +* `ValueError`: If length of encoder_inputsut, targets, or weights is smaller + than the largest (last) bucket. + + +- - - + +### `tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, num_encoder_symbols, num_decoder_symbols_dict, embedding_size, feed_previous=False, dtype=None, scope=None)` {#one2many_rnn_seq2seq} + +One-to-many RNN sequence-to-sequence model (multi-task). + +This is a multi-task sequence-to-sequence model with one encoder and multiple +decoders. Reference to multi-task sequence-to-sequence learning can be found +here: http://arxiv.org/abs/1511.06114 + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs_dict`: A dictionany mapping decoder name (string) to + the corresponding decoder_inputs; each decoder_inputs is a list of 1D + Tensors of shape [batch_size]; num_decoders is defined as + len(decoder_inputs_dict). +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols_dict`: A dictionary mapping decoder name (string) to an + integer specifying number of symbols for the corresponding decoder; + len(num_decoder_symbols_dict) must be equal to num_decoders. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first of + decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). 
+* `scope`: VariableScope for the created subgraph; defaults to + "one2many_rnn_seq2seq" + +##### Returns: + + A tuple of the form (outputs_dict, state_dict), where: + +* `outputs_dict`: A mapping from decoder name (string) to a list of the same + length as decoder_inputs_dict[name]; each element in the list is a 2D + Tensors with shape [batch_size x num_decoder_symbol_list[name]] + containing the generated outputs. +* `state_dict`: A mapping from decoder name (string) to the final state of the + corresponding decoder RNN; it is a 2D Tensor of shape + [batch_size x cell.state_size]. + + +- - - + +### `tf.contrib.legacy_seq2seq.rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, scope=None)` {#rnn_decoder} + +RNN decoder for the sequence-to-sequence model. + +##### Args: + + +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `initial_state`: 2D Tensor with shape [batch_size x cell.state_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `loop_function`: If not None, this function will be applied to the i-th output + in order to generate the i+1-st input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. +* `scope`: VariableScope for the created subgraph; defaults to "rnn_decoder". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing generated outputs. +* `state`: The state of each cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + (Note that in some cases, like basic RNN cell or GRU cell, outputs and + states can be the same. They are different for LSTM cells though.) + + +- - - + +### `tf.contrib.legacy_seq2seq.sequence_loss(logits, targets, weights, average_across_timesteps=True, average_across_batch=True, softmax_loss_function=None, name=None)` {#sequence_loss} + +Weighted cross-entropy loss for a sequence of logits, batch-collapsed. + +##### Args: + + +* `logits`: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. +* `targets`: List of 1D batch-sized int32 Tensors of the same length as logits. +* `weights`: List of 1D batch-sized float-Tensors of the same length as logits. +* `average_across_timesteps`: If set, divide the returned cost by the total + label weight. +* `average_across_batch`: If set, divide the returned cost by the batch size. +* `softmax_loss_function`: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). +* `name`: Optional name for this operation, defaults to "sequence_loss". + +##### Returns: + + A scalar float Tensor: The average log-perplexity per symbol (weighted). + +##### Raises: + + +* `ValueError`: If len(logits) is different from len(targets) or len(weights). + + +- - - + +### `tf.contrib.legacy_seq2seq.sequence_loss_by_example(logits, targets, weights, average_across_timesteps=True, softmax_loss_function=None, name=None)` {#sequence_loss_by_example} + +Weighted cross-entropy loss for a sequence of logits (per example). 
+ +##### Args: + + +* `logits`: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. +* `targets`: List of 1D batch-sized int32 Tensors of the same length as logits. +* `weights`: List of 1D batch-sized float-Tensors of the same length as logits. +* `average_across_timesteps`: If set, divide the returned cost by the total + label weight. +* `softmax_loss_function`: Function (labels-batch, inputs-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). +* `name`: Optional name for this operation, default: "sequence_loss_by_example". + +##### Returns: + + 1D batch-sized float Tensor: The log-perplexity for each sequence. + +##### Raises: + + +* `ValueError`: If len(logits) is different from len(targets) or len(weights). + + +- - - + +### `tf.contrib.legacy_seq2seq.tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, loop_function=None, dtype=tf.float32, scope=None)` {#tied_rnn_seq2seq} + +RNN sequence-to-sequence model with tied encoder and decoder parameters. + +This model first runs an RNN to encode encoder_inputs into a state vector, and +then runs decoder, initialized with the last encoder state, on decoder_inputs. +Encoder and decoder use the same RNN cell and share parameters. + +##### Args: + + +* `encoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `loop_function`: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol), see rnn_decoder for details. +* `dtype`: The dtype of the initial state of the rnn cell (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.linalg.md b/tensorflow/g3doc/api_docs/python/contrib.linalg.md index d5edaa3e826509..146aaf515f48e1 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.linalg.md +++ b/tensorflow/g3doc/api_docs/python/contrib.linalg.md @@ -609,7 +609,7 @@ These have the following meaning way. - - - -#### `tf.contrib.linalg.LinearOperatorDiag.__init__(diag, is_non_singular=None, is_self_adjoint=True, is_positive_definite=None, name='LinearOperatorDiag')` {#LinearOperatorDiag.__init__} +#### `tf.contrib.linalg.LinearOperatorDiag.__init__(diag, is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, name='LinearOperatorDiag')` {#LinearOperatorDiag.__init__} Initialize a `LinearOperatorDiag`. @@ -618,11 +618,10 @@ Initialize a `LinearOperatorDiag`. * `diag`: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`. The diagonal of the operator. Allowed dtypes: `float32`, `float64`, - `complex64`, `complex128`. + `complex64`, `complex128`. * `is_non_singular`: Expect that this operator is non-singular. * `is_self_adjoint`: Expect that this operator is equal to its hermitian - transpose. Since this is a real (not complex) diagonal operator, it is - always self adjoint. + transpose. 
If `diag.dtype` is real, this is auto-set to `True`. * `is_positive_definite`: Expect that this operator is positive definite, meaning the real part of all eigenvalues is positive. We do not require the operator to be self-adjoint to be positive-definite. See: @@ -634,7 +633,7 @@ Initialize a `LinearOperatorDiag`. * `TypeError`: If `diag.dtype` is not an allowed type. -* `ValueError`: If `is_self_adjoint` is not `True`. +* `ValueError`: If `diag.dtype` is real, and `is_self_adjoint` is not `True`. - - - @@ -1010,3 +1009,488 @@ Return a dense (batch) matrix representing this operator. +- - - + +### `class tf.contrib.linalg.LinearOperatorTriL` {#LinearOperatorTriL} + +`LinearOperator` acting like a [batch] square lower triangular matrix. + +This operator acts like a [batch] matrix `A` with shape +`[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a +batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is +an `N x N` matrix. + +`LinearOperatorTriL` is initialized with a `Tensor` having dimensions +`[B1,...,Bb, N, N]`. The upper triangle of the last two dimensions is ignored. + +```python +# Create a 2 x 2 lower-triangular linear operator. +tril = [[1., 2.], [3., 4.]] +operator = LinearOperatorTriL(tril) + +# The upper triangle is ignored. +operator.to_dense() +==> [[1., 0.] + [3., 4.]] + +operator.shape +==> [2, 2] + +operator.log_determinant() +==> scalar Tensor + +x = ... Shape [2, 4] Tensor +operator.apply(x) +==> Shape [2, 4] Tensor + +# Create a [2, 3] batch of 4 x 4 linear operators. +tril = tf.random_normal(shape=[2, 3, 4, 4]) +operator = LinearOperatorTriL(tril) + +# Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible +# since the batch dimensions, [2, 1], are brodcast to +# operator.batch_shape = [2, 3]. +y = tf.random_normal(shape=[2, 1, 4, 2]) +x = operator.solve(y) +==> operator.apply(x) = y +``` + +### Shape compatibility + +This operator acts on [batch] matrix with compatible shape. +`x` is a batch matrix with compatible shape for `apply` and `solve` if + +``` +operator.shape = [B1,...,Bb] + [N, N], with b >= 0 +x.shape = [B1,...,Bb] + [N, R], with R >= 0. +``` + +### Performance + +Suppose `operator` is a `LinearOperatorTriL` of shape `[N, N]`, +and `x.shape = [N, R]`. Then + +* `operator.apply(x)` involves `N^2 * R` multiplications. +* `operator.solve(x)` involves `N * R` size `N` back-substitutions. +* `operator.determinant()` involves a size `N` `reduce_prod`. + +If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and +`[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`. + +### Matrix property hints + +This `LinearOperator` is initialized with boolean flags of the form `is_X`, +for `X = non_singular, self_adjoint` etc... +These have the following meaning +* If `is_X == True`, callers should expect the operator to have the + property `X`. This is a promise that should be fulfilled, but is *not* a + runtime assert. For example, finite floating point precision may result + in these promises being violated. +* If `is_X == False`, callers should expect the operator to not have `X`. +* If `is_X == None` (the default), callers should have no expectation either + way. +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.__init__(tril, is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, name='LinearOperatorTriL')` {#LinearOperatorTriL.__init__} + +Initialize a `LinearOperatorTriL`. + +##### Args: + + +* `tril`: Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`. 
+ The lower triangular part of `tril` defines this operator. The strictly + upper triangle is ignored. Allowed dtypes: `float32`, `float64`. +* `is_non_singular`: Expect that this operator is non-singular. + This operator is non-singular if and only if its diagonal elements are + all non-zero. +* `is_self_adjoint`: Expect that this operator is equal to its hermitian + transpose. This operator is self-adjoint only if it is diagonal with + real-valued diagonal entries. In this case it is advised to use + `LinearOperatorDiag`. +* `is_positive_definite`: Expect that this operator is positive definite, + meaning the real part of all eigenvalues is positive. We do not require + the operator to be self-adjoint to be positive-definite. See: +* `https`: //en.wikipedia.org/wiki/Positive-definite_matrix + #Extension_for_non_symmetric_matrices +* `name`: A name for this `LinearOperator`. + +##### Raises: + + +* `TypeError`: If `diag.dtype` is not an allowed type. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.add_to_tensor(x, name='add_to_tensor')` {#LinearOperatorTriL.add_to_tensor} + +Add matrix represented by this operator to `x`. Equivalent to `A + x`. + +##### Args: + + +* `x`: `Tensor` with same `dtype` and shape broadcastable to `self.shape`. +* `name`: A name to give this `Op`. + +##### Returns: + + A `Tensor` with broadcast shape and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.apply(x, adjoint=False, name='apply')` {#LinearOperatorTriL.apply} + +Transform `x` with left multiplication: `x --> Ax`. + +##### Args: + + +* `x`: `Tensor` with compatible shape and same `dtype` as `self`. + See class docstring for definition of compatibility. +* `adjoint`: Python `bool`. If `True`, left multiply by the adjoint. +* `name`: A name for this `Op. + +##### Returns: + + A `Tensor` with shape `[..., M, R]` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_non_singular(name='assert_non_singular')` {#LinearOperatorTriL.assert_non_singular} + +Returns an `Op` that asserts this operator is non singular. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_positive_definite(name='assert_positive_definite')` {#LinearOperatorTriL.assert_positive_definite} + +Returns an `Op` that asserts this operator is positive definite. + +Here, positive definite means the real part of all eigenvalues is positive. +We do not require the operator to be self-adjoint. + +##### Args: + + +* `name`: A name to give this `Op`. + +##### Returns: + + An `Op` that asserts this operator is positive definite. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_self_adjoint(name='assert_self_adjoint')` {#LinearOperatorTriL.assert_self_adjoint} + +Returns an `Op` that asserts this operator is self-adjoint. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape` {#LinearOperatorTriL.batch_shape} + +`TensorShape` of batch dimensions of this `LinearOperator`. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns +`TensorShape([B1,...,Bb])`, equivalent to `A.get_shape()[:-2]` + +##### Returns: + + `TensorShape`, statically determined, may be undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorTriL.batch_shape_dynamic} + +Shape of batch dimensions of this operator, determined at runtime. 
+ +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns a `Tensor` holding +`[B1,...,Bb]`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.determinant(name='det')` {#LinearOperatorTriL.determinant} + +Determinant for every batch member. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `Tensor` with shape `self.batch_shape` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension` {#LinearOperatorTriL.domain_dimension} + +Dimension (in the sense of vector spaces) of the domain of this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `N`. + +##### Returns: + + Python integer if vector space dimension can be determined statically, + otherwise `None`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorTriL.domain_dimension_dynamic} + +Dimension (in the sense of vector spaces) of the domain of this operator. + +Determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `N`. + +##### Args: + + +* `name`: A name for this `Op`. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.dtype` {#LinearOperatorTriL.dtype} + +The `DType` of `Tensor`s handled by this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.graph_parents` {#LinearOperatorTriL.graph_parents} + +List of graph dependencies of this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_non_singular` {#LinearOperatorTriL.is_non_singular} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_positive_definite` {#LinearOperatorTriL.is_positive_definite} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_self_adjoint` {#LinearOperatorTriL.is_self_adjoint} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.log_abs_determinant(name='log_abs_det')` {#LinearOperatorTriL.log_abs_determinant} + +Log absolute value of determinant for every batch member. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `Tensor` with shape `self.batch_shape` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.name` {#LinearOperatorTriL.name} + +Name prepended to all ops created by this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension` {#LinearOperatorTriL.range_dimension} + +Dimension (in the sense of vector spaces) of the range of this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `M`. + +##### Returns: + + Python integer if vector space dimension can be determined statically, + otherwise `None`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorTriL.range_dimension_dynamic} + +Dimension (in the sense of vector spaces) of the range of this operator. + +Determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `M`. + +##### Args: + + +* `name`: A name for this `Op`. 
+ +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.shape` {#LinearOperatorTriL.shape} + +`TensorShape` of this `LinearOperator`. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns +`TensorShape([B1,...,Bb, M, N])`, equivalent to `A.get_shape()`. + +##### Returns: + + `TensorShape`, statically determined, may be undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.shape_dynamic(name='shape_dynamic')` {#LinearOperatorTriL.shape_dynamic} + +Shape of this `LinearOperator`, determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns a `Tensor` holding +`[B1,...,Bb, M, N]`, equivalent to `tf.shape(A)`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.solve(rhs, adjoint=False, name='solve')` {#LinearOperatorTriL.solve} + +Solve `R` (batch) systems of equations exactly: `A X = rhs`. + +Examples: + +```python +# Create an operator acting like a 10 x 2 x 2 matrix. +operator = LinearOperator(...) +operator.shape # = 10 x 2 x 2 + +# Solve one linear system (R = 1) for every member of the length 10 batch. +RHS = ... # shape 10 x 2 x 1 +X = operator.solve(RHS) # shape 10 x 2 x 1 + +# Solve five linear systems (R = 5) for every member of the length 10 batch. +RHS = ... # shape 10 x 2 x 5 +X = operator.solve(RHS) +X[3, :, 2] # Solution to the linear system A[3, :, :] X = RHS[3, :, 2] +``` + +##### Args: + + +* `rhs`: `Tensor` with same `dtype` as this operator and compatible shape. + See class docstring for definition of compatibility. +* `adjoint`: Python `bool`. If `True`, solve the system involving the adjoint + of this `LinearOperator`. +* `name`: A name scope to use for ops added by this method. + +##### Returns: + + `Tensor` with shape `[...,N, R]` and same `dtype` as `rhs`. + +##### Raises: + + +* `ValueError`: If self.is_non_singular is False. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank` {#LinearOperatorTriL.tensor_rank} + +Rank (in the sense of tensors) of matrix corresponding to this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + Python integer, or None if the tensor rank is undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorTriL.tensor_rank_dynamic} + +Rank (in the sense of tensors) of matrix corresponding to this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `int32` `Tensor`, determined at runtime. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.to_dense(name='to_dense')` {#LinearOperatorTriL.to_dense} + +Return a dense (batch) matrix representing this operator. + + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.losses.md b/tensorflow/g3doc/api_docs/python/contrib.losses.md index e6b0e136a9af66..677d96b9188495 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.losses.md +++ b/tensorflow/g3doc/api_docs/python/contrib.losses.md @@ -10,42 +10,55 @@ Ops for building neural network losses. 
### `tf.contrib.losses.absolute_difference(*args, **kwargs)` {#absolute_difference} -Adds an Absolute Difference loss to the training procedure. (deprecated arguments) +Adds an Absolute Difference loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.absolute_difference instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. +##### Returns: - Returns: - A scalar `Tensor` representing the loss value. + A scalar `Tensor` representing the loss value. - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. - - - ### `tf.contrib.losses.add_loss(*args, **kwargs)` {#add_loss} -Adds a externally defined loss to the collection of losses. +Adds a externally defined loss to the collection of losses. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.add_loss instead. ##### Args: @@ -58,63 +71,85 @@ Adds a externally defined loss to the collection of losses. ### `tf.contrib.losses.compute_weighted_loss(*args, **kwargs)` {#compute_weighted_loss} -Computes the weighted loss. (deprecated arguments) +Computes the weighted loss. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.compute_weighted_loss instead. SOME ARGUMENTS ARE DEPRECATED. 
They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights`. - Args: - losses: A tensor of size [batch_size, d1, ... dN]. - weights: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. - scope: the scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. +##### Args: + + +* `losses`: A tensor of size [batch_size, d1, ... dN]. +* `weights`: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` that returns the weighted loss. + +##### Raises: - Returns: - A scalar `Tensor` that returns the weighted loss. - Raises: - ValueError: If `weights` is `None` or the shape is not compatible with - `losses`, or if the number of dimensions (rank) of either `losses` or - `weights` is missing. +* `ValueError`: If `weights` is `None` or the shape is not compatible with + `losses`, or if the number of dimensions (rank) of either `losses` or + `weights` is missing. - - - ### `tf.contrib.losses.cosine_distance(*args, **kwargs)` {#cosine_distance} -Adds a cosine-distance loss to the training procedure. (deprecated arguments) +Adds a cosine-distance loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.cosine_distance instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - Note that the function assumes that `predictions` and `labels` are already - unit-normalized. +Note that the function assumes that `predictions` and `labels` are already +unit-normalized. + +##### Args: - Args: - predictions: An arbitrary matrix. - labels: A `Tensor` whose shape matches 'predictions' - dim: The dimension along which the cosine distance is computed. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - Returns: - A scalar `Tensor` representing the loss value. +* `predictions`: An arbitrary matrix. +* `labels`: A `Tensor` whose shape matches 'predictions' +* `dim`: The dimension along which the cosine distance is computed. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. - Raises: - ValueError: If `predictions` shape doesn't match `labels` shape, or - `weights` is `None`. +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If `predictions` shape doesn't match `labels` shape, or + `weights` is `None`. - - - -### `tf.contrib.losses.get_losses(scope=None, loss_collection='losses')` {#get_losses} +### `tf.contrib.losses.get_losses(*args, **kwargs)` {#get_losses} -Gets the list of losses from the loss_collection. +Gets the list of losses from the loss_collection. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. 
+Instructions for updating: +Use tf.losses.get_losses instead. ##### Args: @@ -129,9 +164,13 @@ Gets the list of losses from the loss_collection. - - - -### `tf.contrib.losses.get_regularization_losses(scope=None)` {#get_regularization_losses} +### `tf.contrib.losses.get_regularization_losses(*args, **kwargs)` {#get_regularization_losses} -Gets the regularization losses. +Gets the regularization losses. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.get_regularization_losses instead. ##### Args: @@ -145,9 +184,13 @@ Gets the regularization losses. - - - -### `tf.contrib.losses.get_total_loss(add_regularization_losses=True, name='total_loss')` {#get_total_loss} +### `tf.contrib.losses.get_total_loss(*args, **kwargs)` {#get_total_loss} -Returns a tensor whose value represents the total loss. +Returns a tensor whose value represents the total loss. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.get_total_loss instead. Notice that the function adds the given losses to the regularization losses. @@ -172,250 +215,313 @@ Notice that the function adds the given losses to the regularization losses. ### `tf.contrib.losses.hinge_loss(*args, **kwargs)` {#hinge_loss} -Method that returns the loss tensor for hinge loss. (deprecated arguments) +Method that returns the loss tensor for hinge loss. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.hinge_loss instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `target` is being deprecated, use `labels`. - Args: - logits: The logits, a float tensor. - labels: The ground truth output tensor. Its shape should match the shape of - logits. The values of the tensor are expected to be 0.0 or 1.0. - scope: The scope for the operations performed in computing the loss. - target: Deprecated alias for `labels`. +##### Args: - Returns: - A `Tensor` of same shape as logits and target representing the loss values - across the batch. - Raises: - ValueError: If the shapes of `logits` and `labels` don't match. +* `logits`: The logits, a float tensor. +* `labels`: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. +* `scope`: The scope for the operations performed in computing the loss. +* `target`: Deprecated alias for `labels`. + +##### Returns: + + A `Tensor` of same shape as logits and target representing the loss values + across the batch. + +##### Raises: + + +* `ValueError`: If the shapes of `logits` and `labels` don't match. - - - ### `tf.contrib.losses.log_loss(*args, **kwargs)` {#log_loss} -Adds a Log Loss term to the training procedure. (deprecated arguments) +Adds a Log Loss term to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.log_loss instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. 
If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `epsilon`: A small increment to add to avoid taking a log of zero. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - epsilon: A small increment to add to avoid taking a log of zero. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. +##### Returns: - Returns: - A scalar `Tensor` representing the loss value. + A scalar `Tensor` representing the loss value. - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. - - - ### `tf.contrib.losses.mean_pairwise_squared_error(*args, **kwargs)` {#mean_pairwise_squared_error} -Adds a pairwise-errors-squared loss to the training procedure. (deprecated arguments) +Adds a pairwise-errors-squared loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.mean_pairwise_squared_error instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - Unlike `mean_squared_error`, which is a measure of the differences between - corresponding elements of `predictions` and `labels`, - `mean_pairwise_squared_error` is a measure of the differences between pairs of - corresponding elements of `predictions` and `labels`. - - For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are - three pairs of differences are summed to compute the loss: - loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 - - Note that since the inputs are of size [batch_size, d0, ... dN], the - corresponding pairs are computed within each batch sample but not across - samples within a batch. 
For example, if `predictions` represents a batch of - 16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs - is drawn from each image, but not across images. - - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. - - Args: - predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN] - where N+1 is the total number of dimensions in `predictions`. - labels: The ground truth output tensor, whose shape must match the shape of - the `predictions` tensor. - weights: Coefficients for the loss a scalar, a tensor of shape [batch_size] - or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +Unlike `mean_squared_error`, which is a measure of the differences between +corresponding elements of `predictions` and `labels`, +`mean_pairwise_squared_error` is a measure of the differences between pairs of +corresponding elements of `predictions` and `labels`. + +For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are +three pairs of differences are summed to compute the loss: + loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 + +Note that since the inputs are of size [batch_size, d0, ... dN], the +corresponding pairs are computed within each batch sample but not across +samples within a batch. For example, if `predictions` represents a batch of +16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs +is drawn from each image, but not across images. + +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. + +##### Args: + + +* `predictions`: The predicted outputs, a tensor of size [batch_size, d0, .. dN] + where N+1 is the total number of dimensions in `predictions`. +* `labels`: The ground truth output tensor, whose shape must match the shape of + the `predictions` tensor. +* `weights`: Coefficients for the loss a scalar, a tensor of shape [batch_size] + or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. - - - ### `tf.contrib.losses.mean_squared_error(*args, **kwargs)` {#mean_squared_error} -Adds a Sum-of-Squares loss to the training procedure. (deprecated arguments) +Adds a Sum-of-Squares loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.mean_squared_error instead. SOME ARGUMENTS ARE DEPRECATED. 
They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. +##### Raises: - Returns: - A scalar `Tensor` representing the loss value. - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. - - - ### `tf.contrib.losses.sigmoid_cross_entropy(*args, **kwargs)` {#sigmoid_cross_entropy} -Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. (deprecated arguments) +Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.sigmoid_cross_entropy instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights` - `weight` acts as a coefficient for the loss. If a scalar is provided, - then the loss is simply scaled by the given value. If `weight` is a - tensor of size [`batch_size`], then the loss weights apply to each - corresponding sample. +`weight` acts as a coefficient for the loss. If a scalar is provided, +then the loss is simply scaled by the given value. If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. 
+ +If `label_smoothing` is nonzero, smooth the labels towards 1/2: + + new_multiclass_labels = multiclass_labels * (1 - label_smoothing) + + 0.5 * label_smoothing - If `label_smoothing` is nonzero, smooth the labels towards 1/2: +##### Args: + + +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `multi_class_labels`: [batch_size, num_classes] target labels in (0, 1). +* `weights`: Coefficients for the loss. The tensor must be a scalar, a tensor of + shape [batch_size] or shape [batch_size, num_classes]. +* `label_smoothing`: If greater than 0 then smooth the labels. +* `scope`: The scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: - new_multiclass_labels = multiclass_labels * (1 - label_smoothing) - + 0.5 * label_smoothing + A scalar `Tensor` representing the loss value. - Args: - logits: [batch_size, num_classes] logits outputs of the network . - multi_class_labels: [batch_size, num_classes] target labels in (0, 1). - weights: Coefficients for the loss. The tensor must be a scalar, a tensor of - shape [batch_size] or shape [batch_size, num_classes]. - label_smoothing: If greater than 0 then smooth the labels. - scope: The scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. +##### Raises: - Returns: - A scalar `Tensor` representing the loss value. - Raises: - ValueError: If the shape of `logits` doesn't match that of - `multi_class_labels` or if the shape of `weight` is invalid, or if - `weight` is None. +* `ValueError`: If the shape of `logits` doesn't match that of + `multi_class_labels` or if the shape of `weight` is invalid, or if + `weight` is None. - - - ### `tf.contrib.losses.softmax_cross_entropy(*args, **kwargs)` {#softmax_cross_entropy} -Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. (deprecated arguments) +Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.softmax_cross_entropy instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights` - `weight` acts as a coefficient for the loss. If a scalar is provided, - then the loss is simply scaled by the given value. If `weight` is a - tensor of size [`batch_size`], then the loss weights apply to each - corresponding sample. +`weight` acts as a coefficient for the loss. If a scalar is provided, +then the loss is simply scaled by the given value. If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. - If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes: - new_onehot_labels = onehot_labels * (1 - label_smoothing) - + label_smoothing / num_classes +If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes: + new_onehot_labels = onehot_labels * (1 - label_smoothing) + + label_smoothing / num_classes + +##### Args: - Args: - logits: [batch_size, num_classes] logits outputs of the network . - onehot_labels: [batch_size, num_classes] target one_hot_encoded labels. - weights: Coefficients for the loss. The tensor must be a scalar or a tensor - of shape [batch_size]. - label_smoothing: If greater than 0 then smooth the labels. - scope: the scope for the operations performed in computing the loss. 
- weight: Deprecated alias for `weights`. - Returns: - A scalar `Tensor` representing the loss value. +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `onehot_labels`: [batch_size, num_classes] target one_hot_encoded labels. +* `weights`: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size]. +* `label_smoothing`: If greater than 0 then smooth the labels. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: - Raises: - ValueError: If the shape of `logits` doesn't match that of `onehot_labels` - or if the shape of `weight` is invalid or if `weight` is None. + +* `ValueError`: If the shape of `logits` doesn't match that of `onehot_labels` + or if the shape of `weight` is invalid or if `weight` is None. - - - ### `tf.contrib.losses.sparse_softmax_cross_entropy(*args, **kwargs)` {#sparse_softmax_cross_entropy} -Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`. (deprecated arguments) +Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.sparse_softmax_cross_entropy instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights` - `weight` acts as a coefficient for the loss. If a scalar is provided, - then the loss is simply scaled by the given value. If `weight` is a - tensor of size [`batch_size`], then the loss weights apply to each - corresponding sample. - - Args: - logits: [batch_size, num_classes] logits outputs of the network . - labels: [batch_size, 1] or [batch_size] target labels of dtype `int32` or - `int64` in the range `[0, num_classes)`. - weights: Coefficients for the loss. The tensor must be a scalar or a tensor - of shape [batch_size] or [batch_size, 1]. - scope: the scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shapes of logits, labels, and weight are incompatible, or - if `weight` is None. +`weight` acts as a coefficient for the loss. If a scalar is provided, +then the loss is simply scaled by the given value. If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. + +##### Args: + + +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `labels`: [batch_size, 1] or [batch_size] target labels of dtype `int32` or + `int64` in the range `[0, num_classes)`. +* `weights`: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size] or [batch_size, 1]. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shapes of logits, labels, and weight are incompatible, or + if `weight` is None. 
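+
+- - -
+
+Every function in this module now carries a deprecation notice pointing at a
+`tf.losses` counterpart. As a minimal migration sketch (the tensor values below
+are made up, and the keyword names of the `tf.losses` replacement are assumed
+to mirror the arguments documented above):
+
+```python
+import tensorflow as tf
+
+# Toy 3-class problem with a batch of four examples.
+logits = tf.constant([[2.0, 0.5, 0.1],
+                      [0.2, 1.7, 0.3],
+                      [0.1, 0.2, 3.0],
+                      [1.2, 0.4, 0.9]])
+labels = tf.constant([0, 1, 2, 0])           # int labels in [0, num_classes)
+weights = tf.constant([1.0, 1.0, 0.0, 2.0])  # per-sample loss coefficients
+
+# Deprecated spelling; scheduled for removal after 2016-12-30.
+old_loss = tf.contrib.losses.sparse_softmax_cross_entropy(
+    logits, labels, weights=weights)
+
+# Preferred replacement.
+new_loss = tf.losses.sparse_softmax_cross_entropy(
+    labels=labels, logits=logits, weights=weights)
+
+with tf.Session() as sess:
+    # Both calls should yield the same weighted scalar loss.
+    print(sess.run([old_loss, new_loss]))
+```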
diff --git a/tensorflow/g3doc/api_docs/python/contrib.metrics.md b/tensorflow/g3doc/api_docs/python/contrib.metrics.md index 70ee33434303d6..756bdf6ac99347 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.metrics.md +++ b/tensorflow/g3doc/api_docs/python/contrib.metrics.md @@ -457,44 +457,50 @@ THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-08. Instructions for updating: Please use `streaming_sparse_recall_at_k`, and reshape labels from [batch_size] to [batch_size, 1]. - The `streaming_recall_at_k` function creates two local variables, `total` and - `count`, that are used to compute the recall@k frequency. This frequency is - ultimately returned as `recall_at_`: an idempotent operation that simply - divides `total` by `count`. - - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `recall_at_`. Internally, an `in_top_k` operation computes a `Tensor` with - shape [batch_size] whose elements indicate whether or not the corresponding - label is in the top `k` `predictions`. Then `update_op` increments `total` - with the reduced sum of `weights` where `in_top_k` is `True`, and it - increments `count` with the reduced sum of `weights`. - - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - - Args: - predictions: A float `Tensor` of dimension [batch_size, num_classes]. - labels: A `Tensor` of dimension [batch_size] whose type is in `int32`, - `int64`. - k: The number of top elements to look at for computing recall. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `recall_at_k` - should be added to. - updates_collections: An optional list of collections `update_op` should be - added to. - name: An optional variable_scope name. - - Returns: - recall_at_k: A `Tensor` representing the recall@k, the fraction of labels - which fall into the top `k` predictions. - update_op: An operation that increments the `total` and `count` variables - appropriately and whose value matches `recall_at_k`. - - Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `weights` is not `None` and its shape doesn't match `predictions`, or if - either `metrics_collections` or `updates_collections` are not a list or - tuple. +The `streaming_recall_at_k` function creates two local variables, `total` and +`count`, that are used to compute the recall@k frequency. This frequency is +ultimately returned as `recall_at_`: an idempotent operation that simply +divides `total` by `count`. + +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`recall_at_`. Internally, an `in_top_k` operation computes a `Tensor` with +shape [batch_size] whose elements indicate whether or not the corresponding +label is in the top `k` `predictions`. Then `update_op` increments `total` +with the reduced sum of `weights` where `in_top_k` is `True`, and it +increments `count` with the reduced sum of `weights`. + +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + +##### Args: + + +* `predictions`: A float `Tensor` of dimension [batch_size, num_classes]. +* `labels`: A `Tensor` of dimension [batch_size] whose type is in `int32`, + `int64`. +* `k`: The number of top elements to look at for computing recall. 
+* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `recall_at_k` + should be added to. +* `updates_collections`: An optional list of collections `update_op` should be + added to. +* `name`: An optional variable_scope name. + +##### Returns: + + +* `recall_at_k`: A `Tensor` representing the recall@k, the fraction of labels + which fall into the top `k` predictions. +* `update_op`: An operation that increments the `total` and `count` variables + appropriately and whose value matches `recall_at_k`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. - - - @@ -1644,69 +1650,6 @@ Computes the percentage of times that predictions matches labels. if dtype is not bool, integer, or string. -- - - - -### `tf.contrib.metrics.confusion_matrix(predictions, labels, num_classes=None, dtype=tf.int32, name=None, weights=None)` {#confusion_matrix} - -Computes the confusion matrix from predictions and labels. - -Calculate the Confusion Matrix for a pair of prediction and -label 1-D int arrays. - -The matrix rows represent the prediction labels and the columns -represents the real labels. The confusion matrix is always a 2-D array -of shape `[n, n]`, where `n` is the number of valid labels for a given -classification task. Both prediction and labels must be 1-D arrays of -the same shape in order for this function to work. - -If `num_classes` is None, then `num_classes` will be set to the one plus -the maximum value in either predictions or labels. -Class labels are expected to start at 0. E.g., if `num_classes` was -three, then the possible labels would be `[0, 1, 2]`. - -If `weights` is not `None`, then each prediction contributes its -corresponding weight to the total value of the confusion matrix cell. - -For example: - -```python - tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> - [[0 0 0 0 0] - [0 0 1 0 0] - [0 0 1 0 0] - [0 0 0 0 0] - [0 0 0 0 1]] -``` - -Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, -resulting in a 5x5 confusion matrix. - -##### Args: - - -* `predictions`: A 1-D array representing the predictions for a given - classification. -* `labels`: A 1-D representing the real labels for the classification task. -* `num_classes`: The possible number of labels the classification task can - have. If this value is not provided, it will be calculated - using both predictions and labels array. -* `dtype`: Data type of the confusion matrix. -* `name`: Scope name. -* `weights`: An optional `Tensor` whose shape matches `predictions`. - -##### Returns: - - A k X k matrix representing the confusion matrix, where k is the number of - possible labels in the classification task. - -##### Raises: - - -* `ValueError`: If both predictions and labels are not 1-D vectors and have - mismatched shapes, or if `weights` is not `None` and its shape doesn't - match `predictions`. - - - - - diff --git a/tensorflow/g3doc/api_docs/python/contrib.rnn.md b/tensorflow/g3doc/api_docs/python/contrib.rnn.md index 8f28c1923236d5..53b10e221d3bd2 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.rnn.md +++ b/tensorflow/g3doc/api_docs/python/contrib.rnn.md @@ -1,11 +1,854 @@ -# RNN (contrib) +# RNN and Cells (contrib) [TOC] -Additional RNN operations and cells. 
+Module for constructing RNN Cells and additional RNN operations. + +## Base interface for all RNN Cells + +- - - + +### `class tf.contrib.rnn.RNNCell` {#RNNCell} + +Abstract object representing an RNN cell. + +The definition of cell in this package differs from the definition used in the +literature. In the literature, cell refers to an object with a single scalar +output. The definition in this package refers to a horizontal array of such +units. + +An RNN cell, in the most abstract setting, is anything that has +a state and performs some operation that takes a matrix of inputs. +This operation results in an output matrix with `self.output_size` columns. +If `self.state_size` is an integer, this operation also results in a new +state matrix with `self.state_size` columns. If `self.state_size` is a +tuple of integers, then it results in a tuple of `len(state_size)` state +matrices, each with a column size corresponding to values in `state_size`. + +This module provides a number of basic commonly used RNN cells, such as +LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number +of operators that allow add dropouts, projections, or embeddings for inputs. +Constructing multi-layer cells is supported by the class `MultiRNNCell`, +or by calling the `rnn` ops several times. Every `RNNCell` must have the +properties below and and implement `__call__` with the following signature. +- - - + +#### `tf.contrib.rnn.RNNCell.__call__(inputs, state, scope=None)` {#RNNCell.__call__} + +Run this RNN cell on inputs, starting from the given state. + +##### Args: + + +* `inputs`: `2-D` tensor with shape `[batch_size x input_size]`. +* `state`: if `self.state_size` is an integer, this should be a `2-D Tensor` + with shape `[batch_size x self.state_size]`. Otherwise, if + `self.state_size` is a tuple of integers, this should be a tuple + with shapes `[batch_size x s] for s in self.state_size`. +* `scope`: VariableScope for the created subgraph; defaults to class name. + +##### Returns: + + A pair containing: + + - Output: A `2-D` tensor with shape `[batch_size x self.output_size]`. + - New state: Either a single `2-D` tensor, or a tuple of tensors matching + the arity and shapes of `state`. + + +- - - + +#### `tf.contrib.rnn.RNNCell.output_size` {#RNNCell.output_size} + +Integer or TensorShape: size of outputs produced by this cell. + + +- - - + +#### `tf.contrib.rnn.RNNCell.state_size` {#RNNCell.state_size} + +size(s) of state(s) used by this cell. + +It can be represented by an Integer, a TensorShape or a tuple of Integers +or TensorShapes. + + +- - - + +#### `tf.contrib.rnn.RNNCell.zero_state(batch_size, dtype)` {#RNNCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + + +## RNN Cells for use with TensorFlow's core RNN methods + +- - - + +### `class tf.contrib.rnn.BasicRNNCell` {#BasicRNNCell} + +The most basic RNN cell. 
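+
+A minimal usage sketch (the batch size, input size, and unit count below are
+illustrative; the calling convention is the `__call__`/`zero_state` interface
+documented next):
+
+```python
+import tensorflow as tf
+
+batch_size, input_size, num_units = 32, 10, 20
+
+cell = tf.contrib.rnn.BasicRNNCell(num_units)
+inputs = tf.placeholder(tf.float32, [batch_size, input_size])
+state = cell.zero_state(batch_size, tf.float32)
+
+# One step; for this cell output == new_state == act(W * input + U * state + B).
+output, new_state = cell(inputs, state)
+```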
+- - - + +#### `tf.contrib.rnn.BasicRNNCell.__call__(inputs, state, scope=None)` {#BasicRNNCell.__call__} + +Most basic RNN: output = new_state = act(W * input + U * state + B). + + +- - - + +#### `tf.contrib.rnn.BasicRNNCell.__init__(num_units, input_size=None, activation=tanh)` {#BasicRNNCell.__init__} + + + + +- - - + +#### `tf.contrib.rnn.BasicRNNCell.output_size` {#BasicRNNCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.BasicRNNCell.state_size` {#BasicRNNCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.BasicRNNCell.zero_state(batch_size, dtype)` {#BasicRNNCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.BasicLSTMCell` {#BasicLSTMCell} + +Basic LSTM recurrent network cell. + +The implementation is based on: http://arxiv.org/abs/1409.2329. + +We add forget_bias (default: 1) to the biases of the forget gate in order to +reduce the scale of forgetting in the beginning of the training. + +It does not allow cell clipping, a projection layer, and does not +use peep-hole connections: it is the basic baseline. + +For advanced models, please use the full LSTMCell that follows. +- - - + +#### `tf.contrib.rnn.BasicLSTMCell.__call__(inputs, state, scope=None)` {#BasicLSTMCell.__call__} + +Long short-term memory cell (LSTM). + + +- - - + +#### `tf.contrib.rnn.BasicLSTMCell.__init__(num_units, forget_bias=1.0, input_size=None, state_is_tuple=True, activation=tanh)` {#BasicLSTMCell.__init__} + +Initialize the basic LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell. +* `forget_bias`: float, The bias added to forget gates (see above). +* `input_size`: Deprecated and unused. +* `state_is_tuple`: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated + along the column axis. The latter behavior will soon be deprecated. +* `activation`: Activation function of the inner states. + + +- - - + +#### `tf.contrib.rnn.BasicLSTMCell.output_size` {#BasicLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.BasicLSTMCell.state_size` {#BasicLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.BasicLSTMCell.zero_state(batch_size, dtype)` {#BasicLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.GRUCell` {#GRUCell} + +Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 
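+
+A usage sketch over whole sequences (shapes are illustrative, and driving the
+cell with `tf.nn.dynamic_rnn` is one common pattern rather than part of this
+module):
+
+```python
+import tensorflow as tf
+
+batch_size, max_time, input_size, num_units = 16, 50, 8, 64
+
+cell = tf.contrib.rnn.GRUCell(num_units)
+inputs = tf.placeholder(tf.float32, [batch_size, max_time, input_size])
+seq_len = tf.placeholder(tf.int32, [batch_size])
+
+# outputs: [batch_size, max_time, num_units]; state: [batch_size, num_units].
+outputs, state = tf.nn.dynamic_rnn(
+    cell, inputs, sequence_length=seq_len, dtype=tf.float32)
+```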
+- - - + +#### `tf.contrib.rnn.GRUCell.__call__(inputs, state, scope=None)` {#GRUCell.__call__} + +Gated recurrent unit (GRU) with nunits cells. + + +- - - + +#### `tf.contrib.rnn.GRUCell.__init__(num_units, input_size=None, activation=tanh)` {#GRUCell.__init__} + + + + +- - - + +#### `tf.contrib.rnn.GRUCell.output_size` {#GRUCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.GRUCell.state_size` {#GRUCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.GRUCell.zero_state(batch_size, dtype)` {#GRUCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.LSTMCell` {#LSTMCell} + +Long short-term memory unit (LSTM) recurrent network cell. + +The default non-peephole implementation is based on: + + http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf + +S. Hochreiter and J. Schmidhuber. +"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. + +The peephole implementation is based on: + + https://research.google.com/pubs/archive/43905.pdf + +Hasim Sak, Andrew Senior, and Francoise Beaufays. +"Long short-term memory recurrent neural network architectures for + large scale acoustic modeling." INTERSPEECH, 2014. + +The class uses optional peep-hole connections, optional cell clipping, and +an optional projection layer. +- - - + +#### `tf.contrib.rnn.LSTMCell.__call__(inputs, state, scope=None)` {#LSTMCell.__call__} + +Run one step of LSTM. + +##### Args: + + +* `inputs`: input Tensor, 2D, batch x num_units. +* `state`: if `state_is_tuple` is False, this must be a state Tensor, + `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a + tuple of state Tensors, both `2-D`, with column sizes `c_state` and + `m_state`. +* `scope`: VariableScope for the created subgraph; defaults to "lstm_cell". + +##### Returns: + + A tuple containing: + + - A `2-D, [batch x output_dim]`, Tensor representing the output of the + LSTM after reading `inputs` when previous state was `state`. + Here output_dim is: + num_proj if num_proj was set, + num_units otherwise. + - Tensor(s) representing the new state of LSTM after reading `inputs` when + the previous state was `state`. Same type and shape(s) as `state`. + +##### Raises: + + +* `ValueError`: If input size cannot be inferred from inputs via + static shape inference. + + +- - - + +#### `tf.contrib.rnn.LSTMCell.__init__(num_units, input_size=None, use_peepholes=False, cell_clip=None, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=None, num_proj_shards=None, forget_bias=1.0, state_is_tuple=True, activation=tanh)` {#LSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `input_size`: Deprecated and unused. +* `use_peepholes`: bool, set True to enable diagonal/peephole connections. +* `cell_clip`: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. 
+* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_proj`: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. +* `proj_clip`: (optional) A float value. If `num_proj > 0` and `proj_clip` is + provided, then the projected values are clipped elementwise to within + `[-proj_clip, proj_clip]`. +* `num_unit_shards`: Deprecated, will be removed by Jan. 2017. + Use a variable_scope partitioner instead. +* `num_proj_shards`: Deprecated, will be removed by Jan. 2017. + Use a variable_scope partitioner instead. +* `forget_bias`: Biases of the forget gate are initialized by default to 1 + in order to reduce the scale of forgetting at the beginning of + the training. +* `state_is_tuple`: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated + along the column axis. This latter behavior will soon be deprecated. +* `activation`: Activation function of the inner states. + + +- - - + +#### `tf.contrib.rnn.LSTMCell.output_size` {#LSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMCell.state_size` {#LSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMCell.zero_state(batch_size, dtype)` {#LSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + + +## Classes storing split `RNNCell` state + +- - - + +### `class tf.contrib.rnn.LSTMStateTuple` {#LSTMStateTuple} + +Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state. + +Stores two elements: `(c, h)`, in that order. + +Only used when `state_is_tuple=True`. +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.__getnewargs__()` {#LSTMStateTuple.__getnewargs__} + +Return self as a plain tuple. Used by copy and pickle. + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.__getstate__()` {#LSTMStateTuple.__getstate__} + +Exclude the OrderedDict from pickling + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} + +Create new instance of LSTMStateTuple(c, h) + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.__repr__()` {#LSTMStateTuple.__repr__} + +Return a nicely formatted representation string + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.c` {#LSTMStateTuple.c} + +Alias for field number 0 + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.dtype` {#LSTMStateTuple.dtype} + + + + +- - - + +#### `tf.contrib.rnn.LSTMStateTuple.h` {#LSTMStateTuple.h} + +Alias for field number 1 + + + + +## RNN Cell wrappers (RNNCells that wrap other RNNCells) + +- - - + +### `class tf.contrib.rnn.MultiRNNCell` {#MultiRNNCell} + +RNN cell composed sequentially of multiple simple cells. +- - - + +#### `tf.contrib.rnn.MultiRNNCell.__call__(inputs, state, scope=None)` {#MultiRNNCell.__call__} + +Run this multi-layer cell on inputs, starting from state. + + +- - - + +#### `tf.contrib.rnn.MultiRNNCell.__init__(cells, state_is_tuple=True)` {#MultiRNNCell.__init__} + +Create a RNN cell composed sequentially of a number of RNNCells. 
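+
+For instance, a stack of LSTM layers might be assembled roughly as follows (the layer count and sizes are illustrative); the accepted arguments are listed next:
+
+```python
+import tensorflow as tf
+
+num_layers, num_units, batch_size = 3, 64, 32    # illustrative sizes
+inputs = tf.placeholder(tf.float32, [batch_size, 128])
+
+cells = [tf.contrib.rnn.LSTMCell(num_units) for _ in range(num_layers)]
+stacked = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
+
+state = stacked.zero_state(batch_size, tf.float32)  # a tuple, one entry per layer
+output, state = stacked(inputs, state)              # output: [batch_size, num_units]
+```
+
+Each layer here is a separate cell object, and with `state_is_tuple=True` the stack's state is a tuple with one entry per layer.
+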
+ +##### Args: + + +* `cells`: list of RNNCells that will be composed in this order. +* `state_is_tuple`: If True, accepted and returned states are n-tuples, where + `n = len(cells)`. If False, the states are all + concatenated along the column axis. This latter behavior will soon be + deprecated. + +##### Raises: + + +* `ValueError`: if cells is empty (not allowed), or at least one of the cells + returns a state tuple but the flag `state_is_tuple` is `False`. + + +- - - + +#### `tf.contrib.rnn.MultiRNNCell.output_size` {#MultiRNNCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.MultiRNNCell.state_size` {#MultiRNNCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.MultiRNNCell.zero_state(batch_size, dtype)` {#MultiRNNCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.DropoutWrapper` {#DropoutWrapper} + +Operator adding dropout to inputs and outputs of the given cell. +- - - + +#### `tf.contrib.rnn.DropoutWrapper.__call__(inputs, state, scope=None)` {#DropoutWrapper.__call__} + +Run the cell with the declared dropouts. + + +- - - + +#### `tf.contrib.rnn.DropoutWrapper.__init__(cell, input_keep_prob=1.0, output_keep_prob=1.0, seed=None)` {#DropoutWrapper.__init__} + +Create a cell with added input and/or output dropout. + +Dropout is never used on the state. + +##### Args: + + +* `cell`: an RNNCell, a projection to output_size is added to it. +* `input_keep_prob`: unit Tensor or float between 0 and 1, input keep + probability; if it is float and 1, no input dropout will be added. +* `output_keep_prob`: unit Tensor or float between 0 and 1, output keep + probability; if it is float and 1, no output dropout will be added. +* `seed`: (optional) integer, the randomness seed. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. +* `ValueError`: if keep_prob is not between 0 and 1. + + +- - - + +#### `tf.contrib.rnn.DropoutWrapper.output_size` {#DropoutWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.DropoutWrapper.state_size` {#DropoutWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.DropoutWrapper.zero_state(batch_size, dtype)` {#DropoutWrapper.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.EmbeddingWrapper` {#EmbeddingWrapper} + +Operator adding input embedding to the given cell. 
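+
+A rough sketch of composing these wrappers (the vocabulary and sizes are illustrative): the embedding is applied to integer ids first, and dropout then acts on the embedded inputs and on the cell outputs.
+
+```python
+import tensorflow as tf
+
+batch_size, vocab_size, embed_size, num_units = 32, 10000, 64, 128  # illustrative
+word_ids = tf.placeholder(tf.int32, [batch_size])  # one symbol id per batch row
+
+cell = tf.contrib.rnn.GRUCell(num_units)
+cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=0.8, output_keep_prob=0.8)
+cell = tf.contrib.rnn.EmbeddingWrapper(cell, embedding_classes=vocab_size,
+                                       embedding_size=embed_size)
+
+state = cell.zero_state(batch_size, tf.float32)
+output, state = cell(word_ids, state)
+```
+
+Note the ordering: `EmbeddingWrapper` is outermost, so the integer ids are embedded before the inner `DropoutWrapper` and cell see float inputs.
+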
+ +Note: in many cases it may be more efficient to not use this wrapper, +but instead concatenate the whole sequence of your inputs in time, +do the embedding on this batch-concatenated sequence, then split it and +feed into your RNN. +- - - + +#### `tf.contrib.rnn.EmbeddingWrapper.__call__(inputs, state, scope=None)` {#EmbeddingWrapper.__call__} + +Run the cell on embedded inputs. + + +- - - + +#### `tf.contrib.rnn.EmbeddingWrapper.__init__(cell, embedding_classes, embedding_size, initializer=None)` {#EmbeddingWrapper.__init__} + +Create a cell with an added input embedding. + +##### Args: + + +* `cell`: an RNNCell, an embedding will be put before its inputs. +* `embedding_classes`: integer, how many symbols will be embedded. +* `embedding_size`: integer, the size of the vectors we embed into. +* `initializer`: an initializer to use when creating the embedding; + if None, the initializer from variable scope or a default one is used. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. +* `ValueError`: if embedding_classes is not positive. + + +- - - + +#### `tf.contrib.rnn.EmbeddingWrapper.output_size` {#EmbeddingWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.EmbeddingWrapper.state_size` {#EmbeddingWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.EmbeddingWrapper.zero_state(batch_size, dtype)` {#EmbeddingWrapper.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.InputProjectionWrapper` {#InputProjectionWrapper} + +Operator adding an input projection to the given cell. + +Note: in many cases it may be more efficient to not use this wrapper, +but instead concatenate the whole sequence of your inputs in time, +do the projection on this batch-concatenated sequence, then split it. +- - - + +#### `tf.contrib.rnn.InputProjectionWrapper.__call__(inputs, state, scope=None)` {#InputProjectionWrapper.__call__} + +Run the input projection and then the cell. + + +- - - + +#### `tf.contrib.rnn.InputProjectionWrapper.__init__(cell, num_proj, input_size=None)` {#InputProjectionWrapper.__init__} + +Create a cell with input projection. + +##### Args: + + +* `cell`: an RNNCell, a projection of inputs is added before it. +* `num_proj`: Python integer. The dimension to project to. +* `input_size`: Deprecated and unused. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. + + +- - - + +#### `tf.contrib.rnn.InputProjectionWrapper.output_size` {#InputProjectionWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.InputProjectionWrapper.state_size` {#InputProjectionWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.InputProjectionWrapper.zero_state(batch_size, dtype)` {#InputProjectionWrapper.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. 
+ +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.OutputProjectionWrapper` {#OutputProjectionWrapper} + +Operator adding an output projection to the given cell. + +Note: in many cases it may be more efficient to not use this wrapper, +but instead concatenate the whole sequence of your outputs in time, +do the projection on this batch-concatenated sequence, then split it +if needed or directly feed into a softmax. +- - - + +#### `tf.contrib.rnn.OutputProjectionWrapper.__call__(inputs, state, scope=None)` {#OutputProjectionWrapper.__call__} + +Run the cell and output projection on inputs, starting from state. + + +- - - + +#### `tf.contrib.rnn.OutputProjectionWrapper.__init__(cell, output_size)` {#OutputProjectionWrapper.__init__} + +Create a cell with output projection. + +##### Args: + + +* `cell`: an RNNCell, a projection to output_size is added to it. +* `output_size`: integer, the size of the output after projection. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. +* `ValueError`: if output_size is not positive. + + +- - - + +#### `tf.contrib.rnn.OutputProjectionWrapper.output_size` {#OutputProjectionWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.OutputProjectionWrapper.state_size` {#OutputProjectionWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.OutputProjectionWrapper.zero_state(batch_size, dtype)` {#OutputProjectionWrapper.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + -## This package provides additional contributed RNNCells. ### Block RNNCells - - - @@ -258,7 +1101,7 @@ This is an adaptor to time-reverse a FusedRNNCell. For example, ```python -cell = tf.nn.rnn_cell.BasicRNNCell(10) +cell = tf.contrib.rnn.BasicRNNCell(10) fw_lstm = tf.contrib.rnn.FusedRNNCellAdaptor(cell, use_dynamic_rnn=True) bw_lstm = tf.contrib.rnn.TimeReversedFusedRNN(fw_lstm) fw_out, fw_state = fw_lstm(inputs) @@ -1194,3 +2037,158 @@ As described in https://arxiv.org/abs/1303.5778 * `ValueError`: If inputs is None, not a list or an empty list. +- - - + +### `tf.contrib.rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs, initial_state_fw=None, initial_state_bw=None, dtype=None, sequence_length=None, scope=None)` {#static_bidirectional_rnn} + +Creates a bidirectional recurrent neural network. + +Similar to the unidirectional case above (rnn) but takes input and builds +independent forward and backward RNNs with the final forward and backward +outputs depth-concatenated, such that the output will have the format +[time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of +forward and backward cell must match. 
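+
+A minimal sketch (sequence length, batch and cell sizes are illustrative); both initial states are left unset here, so the defaults described below apply:
+
+```python
+import tensorflow as tf
+
+max_time, batch_size, input_size, num_units = 20, 32, 16, 64  # illustrative
+sequence = tf.placeholder(tf.float32, [max_time, batch_size, input_size])
+inputs = tf.unstack(sequence, axis=0)  # length-T list of [batch_size, input_size]
+
+cell_fw = tf.contrib.rnn.GRUCell(num_units)
+cell_bw = tf.contrib.rnn.GRUCell(num_units)
+outputs, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn(
+    cell_fw, cell_bw, inputs, dtype=tf.float32)
+# Each element of `outputs` has shape [batch_size, 2 * num_units].
+```
+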
The initial state for both directions +is zero by default (but can be set optionally) and no intermediate states are +ever returned -- the network is fully unrolled for the given (passed in) +length(s) of the sequence(s) or completely unrolled if length(s) is not given. + +##### Args: + + +* `cell_fw`: An instance of RNNCell, to be used for forward direction. +* `cell_bw`: An instance of RNNCell, to be used for backward direction. +* `inputs`: A length T list of inputs, each a tensor of shape + [batch_size, input_size], or a nested tuple of such elements. +* `initial_state_fw`: (optional) An initial state for the forward RNN. + This must be a tensor of appropriate type and shape + `[batch_size, cell_fw.state_size]`. + If `cell_fw.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +* `initial_state_bw`: (optional) Same as for `initial_state_fw`, but using + the corresponding properties of `cell_bw`. +* `dtype`: (optional) The data type for the initial state. Required if + either of the initial states are not provided. +* `sequence_length`: (optional) An int32/int64 vector, size `[batch_size]`, + containing the actual lengths for each of the sequences. +* `scope`: VariableScope for the created subgraph; defaults to + "bidirectional_rnn" + +##### Returns: + + A tuple (outputs, output_state_fw, output_state_bw) where: + outputs is a length `T` list of outputs (one for each input), which + are depth-concatenated forward and backward outputs. + output_state_fw is the final state of the forward rnn. + output_state_bw is the final state of the backward rnn. + +##### Raises: + + +* `TypeError`: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. +* `ValueError`: If inputs is None or an empty list. + + +- - - + +### `tf.contrib.rnn.static_rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None)` {#static_rnn} + +Creates a recurrent neural network specified by RNNCell `cell`. + +The simplest form of RNN network generated is: + +```python + state = cell.zero_state(...) + outputs = [] + for input_ in inputs: + output, state = cell(input_, state) + outputs.append(output) + return (outputs, state) +``` +However, a few other options are available: + +An initial state can be provided. +If the sequence_length vector is provided, dynamic calculation is performed. +This method of calculation does not compute the RNN steps past the maximum +sequence length of the minibatch (thus saving computational time), +and properly propagates the state at an example's sequence length +to the final state output. + +The dynamic calculation performed is, at time `t` for batch row `b`, + +```python + (output, state)(b, t) = + (t >= sequence_length(b)) + ? (zeros(cell.output_size), states(b, sequence_length(b) - 1)) + : cell(input(b, t), state(b, t - 1)) +``` + +##### Args: + + +* `cell`: An instance of RNNCell. +* `inputs`: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`, or a nested tuple of such elements. +* `initial_state`: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. +* `dtype`: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. 
+* `sequence_length`: Specifies the length of each sequence in inputs. + An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`. +* `scope`: VariableScope for the created subgraph; defaults to "rnn". + +##### Returns: + + A pair (outputs, state) where: + + - outputs is a length T list of outputs (one for each input), or a nested + tuple of such elements. + - state is the final state + +##### Raises: + + +* `TypeError`: If `cell` is not an instance of RNNCell. +* `ValueError`: If `inputs` is `None` or an empty list, or if the input depth + (column size) cannot be inferred from inputs via shape inference. + + +- - - + +### `tf.contrib.rnn.static_state_saving_rnn(cell, inputs, state_saver, state_name, sequence_length=None, scope=None)` {#static_state_saving_rnn} + +RNN that accepts a state saver for time-truncated RNN calculation. + +##### Args: + + +* `cell`: An instance of `RNNCell`. +* `inputs`: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`. +* `state_saver`: A state saver object with methods `state` and `save_state`. +* `state_name`: Python string or tuple of strings. The name to use with the + state_saver. If the cell returns tuples of states (i.e., + `cell.state_size` is a tuple) then `state_name` should be a tuple of + strings having the same length as `cell.state_size`. Otherwise it should + be a single string. +* `sequence_length`: (optional) An int32/int64 vector size [batch_size]. + See the documentation for rnn() for more details about sequence_length. +* `scope`: VariableScope for the created subgraph; defaults to "rnn". + +##### Returns: + + A pair (outputs, state) where: + outputs is a length T list of outputs (one for each input) + states is the final state + +##### Raises: + + +* `TypeError`: If `cell` is not an instance of RNNCell. +* `ValueError`: If `inputs` is `None` or an empty list, or if the arity and + type of `state_name` does not match that of `cell.state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.training.md b/tensorflow/g3doc/api_docs/python/contrib.training.md index 67ce73a347475b..721075273706bf 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.training.md +++ b/tensorflow/g3doc/api_docs/python/contrib.training.md @@ -59,7 +59,7 @@ batch_size = 32 num_unroll = 20 num_enqueue_threads = 3 lstm_size = 8 -cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) +cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) key, sequences, context = my_parser(raw_data) initial_state_values = tf.zeros((state_size,), dtype=tf.float32) @@ -506,7 +506,7 @@ Example usage: batch_size = 32 num_unroll = 20 lstm_size = 8 -cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) +cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) initial_state_values = tf.zeros(cell.state_size, dtype=tf.float32) raw_data = get_single_input_from_input_reader() diff --git a/tensorflow/g3doc/api_docs/python/control_flow_ops.md b/tensorflow/g3doc/api_docs/python/control_flow_ops.md index 31435d5ec30519..79994776cb1b15 100644 --- a/tensorflow/g3doc/api_docs/python/control_flow_ops.md +++ b/tensorflow/g3doc/api_docs/python/control_flow_ops.md @@ -309,7 +309,7 @@ a) If a loop variable is a SparseTensor, the shape invariant must be TensorShape([r]) where r is the rank of the dense tensor represented by the sparse tensor. It means the shapes of the three tensors of the SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here -is the shape of the SparseTensor.shape property. 
It must be the shape of +is the shape of the SparseTensor.dense_shape property. It must be the shape of a vector. b) If a loop variable is an IndexedSlices, the shape invariant must be diff --git a/tensorflow/g3doc/api_docs/python/framework.md b/tensorflow/g3doc/api_docs/python/framework.md index 7c799283fd6e0c..91e15c10289b84 100644 --- a/tensorflow/g3doc/api_docs/python/framework.md +++ b/tensorflow/g3doc/api_docs/python/framework.md @@ -2549,6 +2549,31 @@ unmodified. Otherwise, it is converted to a `Tensor` using * `ValueError`: If `dtype` does not match the element type of `value`. +- - - + +### `tf.convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None)` {#convert_to_tensor_or_sparse_tensor} + +Converts value to a `SparseTensor` or `Tensor`. + +##### Args: + + +* `value`: A `SparseTensor`, `SparseTensorValue`, or an object whose type has a + registered `Tensor` conversion function. +* `dtype`: Optional element type for the returned tensor. If missing, the + type is inferred from the type of `value`. +* `name`: Optional name to use if a new `Tensor` is created. + +##### Returns: + + A `SparseTensor` or `Tensor` based on `value`. + +##### Raises: + + +* `RuntimeError`: If result type is incompatible with `dtype`. + + - - - ### `tf.get_default_graph()` {#get_default_graph} @@ -2795,7 +2820,7 @@ The following standard keys are defined: for more details. * `SUMMARIES`: the summary `Tensor` objects that have been created in the graph. See - [`tf.contrib.deprecated.merge_all_summaries()`](../../api_docs/python/train.md#merge_all_summaries) + [`tf.summary.merge_all()`](../../api_docs/python/summary.md#merge_all) for more details. * `QUEUE_RUNNERS`: the `QueueRunner` objects that are used to produce input for a computation. See diff --git a/tensorflow/g3doc/api_docs/python/functional_ops.md b/tensorflow/g3doc/api_docs/python/functional_ops.md index 3102cad0e5571d..02338eb97ec260 100644 --- a/tensorflow/g3doc/api_docs/python/functional_ops.md +++ b/tensorflow/g3doc/api_docs/python/functional_ops.md @@ -46,13 +46,14 @@ one of the following methods is recommended. First, if the function is expressible as TensorFlow ops, use ```python - result = SparseTensor(input.indices, fn(input.values), input.shape) + result = SparseTensor(input.indices, fn(input.values), input.dense_shape) ``` If, however, the function is not expressible as a TensorFlow op, then use ```python -result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) +result = SparseTensor( + input.indices, map_fn(fn, input.values), input.dense_shape) ``` instead. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseFeature.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseFeature.md new file mode 100644 index 00000000000000..8f967de160ce17 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseFeature.md @@ -0,0 +1,76 @@ +Configuration for parsing a sparse input feature. + +Fields: + index_key: Name of index feature. The underlying feature's type must + be `int64` and its length must always match that of the `value_key` + feature. + value_key: Name of value feature. The underlying feature's type must + be `dtype` and its length must always match that of the `index_key` + feature. + dtype: Data type of the `value_key` feature. + size: Each value in the `index_key` feature must be in `[0, size)`. + already_sorted: A boolean to specify whether the values in `index_key` are + already sorted. 
If so skip sorting, False by default (optional). +- - - + +#### `tf.SparseFeature.__getnewargs__()` {#SparseFeature.__getnewargs__} + +Return self as a plain tuple. Used by copy and pickle. + + +- - - + +#### `tf.SparseFeature.__getstate__()` {#SparseFeature.__getstate__} + +Exclude the OrderedDict from pickling + + +- - - + +#### `tf.SparseFeature.__new__(_cls, index_key, value_key, dtype, size, already_sorted=False)` {#SparseFeature.__new__} + +Create new instance of SparseFeature(index_key, value_key, dtype, size, already_sorted) + + +- - - + +#### `tf.SparseFeature.__repr__()` {#SparseFeature.__repr__} + +Return a nicely formatted representation string + + +- - - + +#### `tf.SparseFeature.already_sorted` {#SparseFeature.already_sorted} + +Alias for field number 4 + + +- - - + +#### `tf.SparseFeature.dtype` {#SparseFeature.dtype} + +Alias for field number 2 + + +- - - + +#### `tf.SparseFeature.index_key` {#SparseFeature.index_key} + +Alias for field number 0 + + +- - - + +#### `tf.SparseFeature.size` {#SparseFeature.size} + +Alias for field number 3 + + +- - - + +#### `tf.SparseFeature.value_key` {#SparseFeature.value_key} + +Alias for field number 1 + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseTensorValue.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseTensorValue.md index ad46827ac2e1cc..b6b84d7d94c777 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseTensorValue.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.SparseTensorValue.md @@ -1,50 +1,59 @@ -SparseTensorValue(indices, values, shape) +Stores the calculated numpy arrays representing a `SparseTensor`. + +Returned as the output of a session.run on a `SparseTensor` object. - - - -#### `tf.SparseTensorValue.__getnewargs__()` {#SparseTensorValue.__getnewargs__} +#### `tf.SparseTensorValue.__getitem__(i)` {#SparseTensorValue.__getitem__} + -Return self as a plain tuple. Used by copy and pickle. - - - -#### `tf.SparseTensorValue.__getstate__()` {#SparseTensorValue.__getstate__} +#### `tf.SparseTensorValue.__init__(indices, values, dense_shape=None, shape=None)` {#SparseTensorValue.__init__} + -Exclude the OrderedDict from pickling - - - -#### `tf.SparseTensorValue.__new__(_cls, indices, values, shape)` {#SparseTensorValue.__new__} +#### `tf.SparseTensorValue.__iter__()` {#SparseTensorValue.__iter__} + -Create new instance of SparseTensorValue(indices, values, shape) - - - #### `tf.SparseTensorValue.__repr__()` {#SparseTensorValue.__repr__} -Return a nicely formatted representation string + + + +- - - + +#### `tf.SparseTensorValue.dense_shape` {#SparseTensorValue.dense_shape} + + - - - #### `tf.SparseTensorValue.indices` {#SparseTensorValue.indices} -Alias for field number 0 + - - - #### `tf.SparseTensorValue.shape` {#SparseTensorValue.shape} -Alias for field number 2 + - - - #### `tf.SparseTensorValue.values` {#SparseTensorValue.values} -Alias for field number 1 + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.TensorArray.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.TensorArray.md index baffb1789ab3c4..a0252d096a61d9 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.TensorArray.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.TensorArray.md @@ -59,11 +59,12 @@ must all match. 
- - - -#### `tf.TensorArray.pack(name=None)` {#TensorArray.pack} +#### `tf.TensorArray.pack(*args, **kwargs)` {#TensorArray.pack} -Return the values in the TensorArray as a packed `Tensor`. +Return the values in the TensorArray as a stacked `Tensor`. All of the values must have been written and their shapes must all match. +If input shapes have rank-`R`, then output shape will have rank-`(R+1)`. ##### Args: @@ -72,7 +73,7 @@ All of the values must have been written and their shapes must all match. ##### Returns: - All the tensors in the TensorArray packed into one tensor. + All the tensors in the TensorArray stacked into one tensor. - - - @@ -146,19 +147,22 @@ Scatter the values of a `Tensor` in specific indices of a `TensorArray`. - - - -#### `tf.TensorArray.unpack(value, name=None)` {#TensorArray.unpack} +#### `tf.TensorArray.unpack(*args, **kwargs)` {#TensorArray.unpack} -Pack the values of a `Tensor` in the TensorArray. +Unstack the values of a `Tensor` in the TensorArray. + +If input value shapes have rank-`R`, then the output TensorArray will +contain elements whose shapes are rank-`(R-1)`. ##### Args: -* `value`: (N+1)-D. Tensor of type `dtype`. The Tensor to unpack. +* `value`: (N+1)-D. Tensor of type `dtype`. The Tensor to unstack. * `name`: A name for the operation (optional). ##### Returns: - A new TensorArray object with flow that ensures the unpack occurs. + A new TensorArray object with flow that ensures the unstack occurs. Use this object all for subsequent operations. ##### Raises: @@ -204,7 +208,7 @@ Split the values of a `Tensor` into the TensorArray. #### Other Methods - - - -#### `tf.TensorArray.__init__(dtype, size=None, dynamic_size=None, clear_after_read=None, tensor_array_name=None, handle=None, flow=None, infer_shape=True, elem_shape=None, name=None)` {#TensorArray.__init__} +#### `tf.TensorArray.__init__(dtype, size=None, dynamic_size=None, clear_after_read=None, tensor_array_name=None, handle=None, flow=None, infer_shape=True, element_shape=None, name=None)` {#TensorArray.__init__} Construct a new TensorArray or wrap an existing TensorArray handle. @@ -235,8 +239,9 @@ is created within a `while_loop`. `TensorArray.flow`. * `infer_shape`: (optional, default: True) If True, shape inference is enabled. In this case, all elements must have the same shape. -* `elem_shape`: (optional, default: None) A TensorShape object specifying - the shape of all the elements of the TensorArray. +* `element_shape`: (optional, default: None) A `TensorShape` object specifying + the shape constraints of each of the elements of the TensorArray. + Need not be fully defined. * `name`: A name for the operation (optional). ##### Raises: @@ -267,3 +272,48 @@ The data type of this TensorArray. Return the size of the TensorArray. +- - - + +#### `tf.TensorArray.stack(name=None)` {#TensorArray.stack} + +Return the values in the TensorArray as a stacked `Tensor`. + +All of the values must have been written and their shapes must all match. +If input shapes have rank-`R`, then output shape will have rank-`(R+1)`. + +##### Args: + + +* `name`: A name for the operation (optional). + +##### Returns: + + All the tensors in the TensorArray stacked into one tensor. + + +- - - + +#### `tf.TensorArray.unstack(value, name=None)` {#TensorArray.unstack} + +Unstack the values of a `Tensor` in the TensorArray. + +If input value shapes have rank-`R`, then the output TensorArray will +contain elements whose shapes are rank-`(R-1)`. + +##### Args: + + +* `value`: (N+1)-D. Tensor of type `dtype`. 
The Tensor to unstack. +* `name`: A name for the operation (optional). + +##### Returns: + + A new TensorArray object with flow that ensures the unstack occurs. + Use this object all for subsequent operations. + +##### Raises: + + +* `ValueError`: if the shape inference fails. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.weighted_sparse_column.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.weighted_sparse_column.md index ca524575fe14d6..cba516920d017e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.weighted_sparse_column.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.weighted_sparse_column.md @@ -19,7 +19,7 @@ Example: is a SparseTensor. Following are assumed to be true: * sparse_tensor.indices = weights_tensor.indices - * sparse_tensor.shape = weights_tensor.shape + * sparse_tensor.dense_shape = weights_tensor.dense_shape ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md index 81405d1ab56a16..5c49703aa8229c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md @@ -99,7 +99,7 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate} +#### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LinearRegressor.evaluate} See evaluable.Evaluable. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.attention_decoder.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.attention_decoder.md new file mode 100644 index 00000000000000..8043a4dad5497d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.attention_decoder.md @@ -0,0 +1,60 @@ +### `tf.contrib.legacy_seq2seq.attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=None, scope=None, initial_state_attention=False)` {#attention_decoder} + +RNN decoder with attention for the sequence-to-sequence model. + +In this context "attention" means that, during decoding, the RNN can look up +information in the additional tensor attention_states, and it does this by +focusing on a few entries from the tensor. This model has proven to yield +especially good results in a number of sequence-to-sequence tasks. This +implementation is based on http://arxiv.org/abs/1412.7449 (see below for +details). It is recommended for complex sequence-to-sequence tasks. + +##### Args: + + +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `attention_states`: 3D Tensor [batch_size x attn_length x attn_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `output_size`: Size of the output vectors; if None, we use cell.output_size. 
+* `num_heads`: Number of attention heads that read from attention_states. +* `loop_function`: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. +* `dtype`: The dtype to use for the RNN initial state (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "attention_decoder". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors of + shape [batch_size x output_size]. These represent the generated outputs. + Output i is computed from input i (which is either the i-th element + of decoder_inputs or loop_function(output {i-1}, i)) as follows. + First, we run the cell on a combination of the input and previous + attention masks: + cell_output, new_state = cell(linear(input, prev_attn), prev_state). + Then, we calculate new attention masks: + new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) + and then we calculate the output: + output = linear(cell_output, new_attn). +* `state`: The state of each decoder cell the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: when num_heads is not positive, there are no inputs, shapes + of attention_states are not set, or input size cannot be inferred + from the input. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_decoder.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_decoder.md new file mode 100644 index 00000000000000..3ba228523faf19 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_decoder.md @@ -0,0 +1,48 @@ +### `tf.contrib.legacy_seq2seq.embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, embedding_size, output_projection=None, feed_previous=False, update_embedding_for_previous=True, scope=None)` {#embedding_rnn_decoder} + +RNN decoder with embedding and a pure-decoding option. + +##### Args: + + +* `decoder_inputs`: A list of 1D batch-sized int32 Tensors (decoder inputs). +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `cell`: rnn_cell.RNNCell defining the cell function. +* `num_symbols`: Integer, how many symbols come into the embedding. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each fed + previous output will first be multiplied by W and added B. 
+* `feed_previous`: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). +* `update_embedding_for_previous`: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_rnn_decoder". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors. The + output is of shape [batch_size x cell.output_size] when + output_projection is not None (and represents the dense representation + of predicted tokens). It is of shape [batch_size x num_decoder_symbols] + when output_projection is None. +* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq.md new file mode 100644 index 00000000000000..2eafffe765d030 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq.md @@ -0,0 +1,46 @@ +### `tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=None, scope=None)` {#embedding_rnn_seq2seq} + +Embedding RNN sequence-to-sequence model. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_encoder_symbols x input_size]). Then it runs an RNN to encode +embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs +by another newly created embedding (of shape [num_decoder_symbols x +input_size]). Then it runs RNN decoder, initialized with the last +encoder state, on embedded decoder_inputs. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols`: Integer; number of symbols on the decoder side. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. 
+* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_rnn_seq2seq" + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors. The + output is of shape [batch_size x cell.output_size] when + output_projection is not None (and represents the dense representation + of predicted tokens). It is of shape [batch_size x num_decoder_symbols] + when output_projection is None. +* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.sequence_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.sequence_loss.md new file mode 100644 index 00000000000000..c0beb1541edb52 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.legacy_seq2seq.sequence_loss.md @@ -0,0 +1,26 @@ +### `tf.contrib.legacy_seq2seq.sequence_loss(logits, targets, weights, average_across_timesteps=True, average_across_batch=True, softmax_loss_function=None, name=None)` {#sequence_loss} + +Weighted cross-entropy loss for a sequence of logits, batch-collapsed. + +##### Args: + + +* `logits`: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. +* `targets`: List of 1D batch-sized int32 Tensors of the same length as logits. +* `weights`: List of 1D batch-sized float-Tensors of the same length as logits. +* `average_across_timesteps`: If set, divide the returned cost by the total + label weight. +* `average_across_batch`: If set, divide the returned cost by the batch size. +* `softmax_loss_function`: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). +* `name`: Optional name for this operation, defaults to "sequence_loss". + +##### Returns: + + A scalar float Tensor: The average log-perplexity per symbol (weighted). + +##### Raises: + + +* `ValueError`: If len(logits) is different from len(targets) or len(weights). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md index f4360dc46ca7b3..f2c6c8cf6ca0bd 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md @@ -76,7 +76,7 @@ These have the following meaning way. 
- - - -#### `tf.contrib.linalg.LinearOperatorDiag.__init__(diag, is_non_singular=None, is_self_adjoint=True, is_positive_definite=None, name='LinearOperatorDiag')` {#LinearOperatorDiag.__init__} +#### `tf.contrib.linalg.LinearOperatorDiag.__init__(diag, is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, name='LinearOperatorDiag')` {#LinearOperatorDiag.__init__} Initialize a `LinearOperatorDiag`. @@ -85,11 +85,10 @@ Initialize a `LinearOperatorDiag`. * `diag`: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`. The diagonal of the operator. Allowed dtypes: `float32`, `float64`, - `complex64`, `complex128`. + `complex64`, `complex128`. * `is_non_singular`: Expect that this operator is non-singular. * `is_self_adjoint`: Expect that this operator is equal to its hermitian - transpose. Since this is a real (not complex) diagonal operator, it is - always self adjoint. + transpose. If `diag.dtype` is real, this is auto-set to `True`. * `is_positive_definite`: Expect that this operator is positive definite, meaning the real part of all eigenvalues is positive. We do not require the operator to be self-adjoint to be positive-definite. See: @@ -101,7 +100,7 @@ Initialize a `LinearOperatorDiag`. * `TypeError`: If `diag.dtype` is not an allowed type. -* `ValueError`: If `is_self_adjoint` is not `True`. +* `ValueError`: If `diag.dtype` is real, and `is_self_adjoint` is not `True`. - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.losses.get_losses.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.losses.get_losses.md index 837d82c8084b69..da8e3ed5bb56e6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.losses.get_losses.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.losses.get_losses.md @@ -1,6 +1,10 @@ -### `tf.contrib.losses.get_losses(scope=None, loss_collection='losses')` {#get_losses} +### `tf.contrib.losses.get_losses(*args, **kwargs)` {#get_losses} -Gets the list of losses from the loss_collection. +Gets the list of losses from the loss_collection. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.get_losses instead. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.MultiRNNCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.MultiRNNCell.md similarity index 81% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.MultiRNNCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.MultiRNNCell.md index 15b20babeb72eb..47c1855010b911 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.MultiRNNCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.MultiRNNCell.md @@ -1,14 +1,14 @@ RNN cell composed sequentially of multiple simple cells. - - - -#### `tf.nn.rnn_cell.MultiRNNCell.__call__(inputs, state, scope=None)` {#MultiRNNCell.__call__} +#### `tf.contrib.rnn.MultiRNNCell.__call__(inputs, state, scope=None)` {#MultiRNNCell.__call__} Run this multi-layer cell on inputs, starting from state. 
- - - -#### `tf.nn.rnn_cell.MultiRNNCell.__init__(cells, state_is_tuple=True)` {#MultiRNNCell.__init__} +#### `tf.contrib.rnn.MultiRNNCell.__init__(cells, state_is_tuple=True)` {#MultiRNNCell.__init__} Create a RNN cell composed sequentially of a number of RNNCells. @@ -30,21 +30,21 @@ Create a RNN cell composed sequentially of a number of RNNCells. - - - -#### `tf.nn.rnn_cell.MultiRNNCell.output_size` {#MultiRNNCell.output_size} +#### `tf.contrib.rnn.MultiRNNCell.output_size` {#MultiRNNCell.output_size} - - - -#### `tf.nn.rnn_cell.MultiRNNCell.state_size` {#MultiRNNCell.state_size} +#### `tf.contrib.rnn.MultiRNNCell.state_size` {#MultiRNNCell.state_size} - - - -#### `tf.nn.rnn_cell.MultiRNNCell.zero_state(batch_size, dtype)` {#MultiRNNCell.zero_state} +#### `tf.contrib.rnn.MultiRNNCell.zero_state(batch_size, dtype)` {#MultiRNNCell.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.rnn_cell.OutputProjectionWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.OutputProjectionWrapper.md similarity index 82% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.rnn_cell.OutputProjectionWrapper.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.OutputProjectionWrapper.md index 4480aa9e18f2fd..87e1024613c190 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.rnn_cell.OutputProjectionWrapper.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.OutputProjectionWrapper.md @@ -6,14 +6,14 @@ do the projection on this batch-concatenated sequence, then split it if needed or directly feed into a softmax. - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.__call__(inputs, state, scope=None)` {#OutputProjectionWrapper.__call__} +#### `tf.contrib.rnn.OutputProjectionWrapper.__call__(inputs, state, scope=None)` {#OutputProjectionWrapper.__call__} Run the cell and output projection on inputs, starting from state. - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.__init__(cell, output_size)` {#OutputProjectionWrapper.__init__} +#### `tf.contrib.rnn.OutputProjectionWrapper.__init__(cell, output_size)` {#OutputProjectionWrapper.__init__} Create a cell with output projection. @@ -32,21 +32,21 @@ Create a cell with output projection. - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.output_size` {#OutputProjectionWrapper.output_size} +#### `tf.contrib.rnn.OutputProjectionWrapper.output_size` {#OutputProjectionWrapper.output_size} - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.state_size` {#OutputProjectionWrapper.state_size} +#### `tf.contrib.rnn.OutputProjectionWrapper.state_size` {#OutputProjectionWrapper.state_size} - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.zero_state(batch_size, dtype)` {#OutputProjectionWrapper.zero_state} +#### `tf.contrib.rnn.OutputProjectionWrapper.zero_state(batch_size, dtype)` {#OutputProjectionWrapper.zero_state} Return zero-filled state tensor(s). 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.framework.convert_to_tensor_or_sparse_tensor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.convert_to_tensor_or_sparse_tensor.md similarity index 80% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.framework.convert_to_tensor_or_sparse_tensor.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.convert_to_tensor_or_sparse_tensor.md index 089f02a9b47da7..1999e711806f74 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.framework.convert_to_tensor_or_sparse_tensor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.convert_to_tensor_or_sparse_tensor.md @@ -1,4 +1,4 @@ -### `tf.contrib.framework.convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None)` {#convert_to_tensor_or_sparse_tensor} +### `tf.convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None)` {#convert_to_tensor_or_sparse_tensor} Converts value to a `SparseTensor` or `Tensor`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.ctc_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.ctc_loss.md index 3d5cfdfb7a8425..128808ff36a0cc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.ctc_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.ctc_loss.md @@ -1,4 +1,4 @@ -### `tf.nn.ctc_loss(inputs, labels, sequence_length, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True)` {#ctc_loss} +### `tf.nn.ctc_loss(labels, inputs, sequence_length, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True)` {#ctc_loss} Computes the CTC (Connectionist Temporal Classification) Loss. @@ -68,17 +68,17 @@ Here is a table of the (roughly) expected first order behavior: ##### Args: +* `labels`: An `int32` `SparseTensor`. + `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores + the id for (batch b, time t). + `labels.values[i]` must take on values in `[0, num_labels)`. + See `core/ops/ctc_ops.cc` for more details. * `inputs`: 3-D `float` `Tensor`. If time_major == False, this will be a `Tensor` shaped: `[batch_size x max_time x num_classes]`. If time_major == True (default), this will be a `Tensor` shaped: `[max_time x batch_size x num_classes]`. The logits. -* `labels`: An `int32` `SparseTensor`. - `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores - the id for (batch b, time t). - `labels.values[i]` must take on values in `[0, num_labels)`. - See `core/ops/ctc_ops.cc` for more details. * `sequence_length`: 1-D `int32` vector, size `[batch_size]`. The sequence lengths. * `preprocess_collapse_repeated`: Boolean. Default: False. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md index 03dd068320fc45..b950cd5fe69eb6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md @@ -1,17 +1,17 @@ -### `tf.reverse_sequence(input, seq_lengths, seq_dim, batch_dim=None, name=None)` {#reverse_sequence} +### `tf.reverse_sequence(input, seq_lengths, seq_axis=None, batch_axis=None, name=None, seq_dim=None, batch_dim=None)` {#reverse_sequence} Reverses variable length slices. 
-This op first slices `input` along the dimension `batch_dim`, and for each +This op first slices `input` along the dimension `batch_axis`, and for each slice `i`, reverses the first `seq_lengths[i]` elements along -the dimension `seq_dim`. +the dimension `seq_axis`. The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`, and `seq_lengths` must be a vector of length `input.dims[batch_dim]`. -The output slice `i` along dimension `batch_dim` is then given by input +The output slice `i` along dimension `batch_axis` is then given by input slice `i`, with the first `seq_lengths[i]` slices along dimension -`seq_dim` reversed. +`seq_axis` reversed. For example: @@ -64,8 +64,8 @@ output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...] * `seq_lengths`: A `Tensor`. Must be one of the following types: `int32`, `int64`. 1-D with length `input.dims(batch_dim)` and `max(seq_lengths) < input.dims(seq_dim)` -* `seq_dim`: An `int`. The dimension which is partially reversed. -* `batch_dim`: An optional `int`. Defaults to `0`. +* `seq_axis`: An `int`. The dimension which is partially reversed. +* `batch_axis`: An optional `int`. Defaults to `0`. The dimension along which reversal is performed. * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.sparse_tensor_to_dense.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.sparse_tensor_to_dense.md index a0c0a6ca9c957c..6269665d08d0ee 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.sparse_tensor_to_dense.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.sparse_tensor_to_dense.md @@ -32,7 +32,7 @@ tested if validate_indices is True. ##### Returns: - A dense tensor with shape `sp_input.shape` and values specified by + A dense tensor with shape `sp_input.dense_shape` and values specified by the non-empty values in `sp_input`. Indices not in `sp_input` are assigned `default_value`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.train.SummaryWriterCache.clear.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.train.SummaryWriterCache.clear.md deleted file mode 100644 index adc6984e2eff74..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.train.SummaryWriterCache.clear.md +++ /dev/null @@ -1,4 +0,0 @@ -#### `tf.train.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} - -Clear cached summary writers. Currently only used for unit tests. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md index fbbab399f39d8a..e1c34b665ad2e3 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md @@ -127,7 +127,7 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate} +#### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LinearClassifier.evaluate} See evaluable.Evaluable. 
Note: Labels must be integer class indices. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.SummaryWriterCache.get.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.SummaryWriterCache.get.md similarity index 62% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.SummaryWriterCache.get.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.SummaryWriterCache.get.md index d5fee8f7b49fce..35b49b99cfc9ca 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.SummaryWriterCache.get.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.SummaryWriterCache.get.md @@ -1,4 +1,4 @@ -#### `tf.train.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} +#### `tf.contrib.learn.monitors.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} Returns the FileWriter for the specified directory. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.absolute_difference.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.absolute_difference.md index 03c6a45264f1e3..83a87616e3de16 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.absolute_difference.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.absolute_difference.md @@ -1,32 +1,41 @@ ### `tf.contrib.losses.absolute_difference(*args, **kwargs)` {#absolute_difference} -Adds an Absolute Difference loss to the training procedure. (deprecated arguments) +Adds an Absolute Difference loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.absolute_difference instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. - - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. 
If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.compute_weighted_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.compute_weighted_loss.md index 8cd3b0d69fbfa8..2df0588696d33a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.compute_weighted_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.compute_weighted_loss.md @@ -1,22 +1,31 @@ ### `tf.contrib.losses.compute_weighted_loss(*args, **kwargs)` {#compute_weighted_loss} -Computes the weighted loss. (deprecated arguments) +Computes the weighted loss. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.compute_weighted_loss instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights`. - Args: - losses: A tensor of size [batch_size, d1, ... dN]. - weights: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. - scope: the scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. +##### Args: + + +* `losses`: A tensor of size [batch_size, d1, ... dN]. +* `weights`: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` that returns the weighted loss. + +##### Raises: - Returns: - A scalar `Tensor` that returns the weighted loss. - Raises: - ValueError: If `weights` is `None` or the shape is not compatible with - `losses`, or if the number of dimensions (rank) of either `losses` or - `weights` is missing. +* `ValueError`: If `weights` is `None` or the shape is not compatible with + `losses`, or if the number of dimensions (rank) of either `losses` or + `weights` is missing. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md index c9df0f160dafc8..a63259ae717430 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md @@ -1,22 +1,31 @@ ### `tf.contrib.losses.hinge_loss(*args, **kwargs)` {#hinge_loss} -Method that returns the loss tensor for hinge loss. (deprecated arguments) +Method that returns the loss tensor for hinge loss. 
(deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.hinge_loss instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `target` is being deprecated, use `labels`. - Args: - logits: The logits, a float tensor. - labels: The ground truth output tensor. Its shape should match the shape of - logits. The values of the tensor are expected to be 0.0 or 1.0. - scope: The scope for the operations performed in computing the loss. - target: Deprecated alias for `labels`. +##### Args: + + +* `logits`: The logits, a float tensor. +* `labels`: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. +* `scope`: The scope for the operations performed in computing the loss. +* `target`: Deprecated alias for `labels`. + +##### Returns: + + A `Tensor` of same shape as logits and target representing the loss values + across the batch. + +##### Raises: - Returns: - A `Tensor` of same shape as logits and target representing the loss values - across the batch. - Raises: - ValueError: If the shapes of `logits` and `labels` don't match. +* `ValueError`: If the shapes of `logits` and `labels` don't match. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.rnn_cell.BasicLSTMCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.BasicLSTMCell.md similarity index 84% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.rnn_cell.BasicLSTMCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.BasicLSTMCell.md index c34a52613d09d1..eb4a38a8c3d03a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.rnn_cell.BasicLSTMCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.BasicLSTMCell.md @@ -11,14 +11,14 @@ use peep-hole connections: it is the basic baseline. For advanced models, please use the full LSTMCell that follows. - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.__call__(inputs, state, scope=None)` {#BasicLSTMCell.__call__} +#### `tf.contrib.rnn.BasicLSTMCell.__call__(inputs, state, scope=None)` {#BasicLSTMCell.__call__} Long short-term memory cell (LSTM). - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.__init__(num_units, forget_bias=1.0, input_size=None, state_is_tuple=True, activation=tanh)` {#BasicLSTMCell.__init__} +#### `tf.contrib.rnn.BasicLSTMCell.__init__(num_units, forget_bias=1.0, input_size=None, state_is_tuple=True, activation=tanh)` {#BasicLSTMCell.__init__} Initialize the basic LSTM cell. @@ -36,21 +36,21 @@ Initialize the basic LSTM cell. - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.output_size` {#BasicLSTMCell.output_size} +#### `tf.contrib.rnn.BasicLSTMCell.output_size` {#BasicLSTMCell.output_size} - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.state_size` {#BasicLSTMCell.state_size} +#### `tf.contrib.rnn.BasicLSTMCell.state_size` {#BasicLSTMCell.state_size} - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.zero_state(batch_size, dtype)` {#BasicLSTMCell.zero_state} +#### `tf.contrib.rnn.BasicLSTMCell.zero_state(batch_size, dtype)` {#BasicLSTMCell.zero_state} Return zero-filled state tensor(s). 
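A similarly hedged sketch for the renamed `BasicLSTMCell`: construct the cell, request a zero state, and run a single step. The shapes are illustrative and the call assumes the default `state_is_tuple=True` behavior documented above.

```python
import tensorflow as tf

batch_size, input_size, num_units = 32, 64, 128

cell = tf.contrib.rnn.BasicLSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
inputs = tf.placeholder(tf.float32, [batch_size, input_size])  # one time step of input
state = cell.zero_state(batch_size, tf.float32)                # LSTMStateTuple (c, h) of zeros
output, state = cell(inputs, state)                            # output: [batch_size, num_units]
```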
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.GRUCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GRUCell.md similarity index 71% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.GRUCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GRUCell.md index fa850232b1e85f..4f7cf4402f4a16 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.GRUCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GRUCell.md @@ -1,35 +1,35 @@ Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). - - - -#### `tf.nn.rnn_cell.GRUCell.__call__(inputs, state, scope=None)` {#GRUCell.__call__} +#### `tf.contrib.rnn.GRUCell.__call__(inputs, state, scope=None)` {#GRUCell.__call__} Gated recurrent unit (GRU) with nunits cells. - - - -#### `tf.nn.rnn_cell.GRUCell.__init__(num_units, input_size=None, activation=tanh)` {#GRUCell.__init__} +#### `tf.contrib.rnn.GRUCell.__init__(num_units, input_size=None, activation=tanh)` {#GRUCell.__init__} - - - -#### `tf.nn.rnn_cell.GRUCell.output_size` {#GRUCell.output_size} +#### `tf.contrib.rnn.GRUCell.output_size` {#GRUCell.output_size} - - - -#### `tf.nn.rnn_cell.GRUCell.state_size` {#GRUCell.state_size} +#### `tf.contrib.rnn.GRUCell.state_size` {#GRUCell.state_size} - - - -#### `tf.nn.rnn_cell.GRUCell.zero_state(batch_size, dtype)` {#GRUCell.zero_state} +#### `tf.contrib.rnn.GRUCell.zero_state(batch_size, dtype)` {#GRUCell.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.TimeReversedFusedRNN.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.TimeReversedFusedRNN.md index c6f788f75d8e7f..0d9eb1ff9079af 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.TimeReversedFusedRNN.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.TimeReversedFusedRNN.md @@ -3,7 +3,7 @@ This is an adaptor to time-reverse a FusedRNNCell. For example, ```python -cell = tf.nn.rnn_cell.BasicRNNCell(10) +cell = tf.contrib.rnn.BasicRNNCell(10) fw_lstm = tf.contrib.rnn.FusedRNNCellAdaptor(cell, use_dynamic_rnn=True) bw_lstm = tf.contrib.rnn.TimeReversedFusedRNN(fw_lstm) fw_out, fw_state = fw_lstm(inputs) diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.static_state_saving_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.static_state_saving_rnn.md new file mode 100644 index 00000000000000..0f6ef9e409037b --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.static_state_saving_rnn.md @@ -0,0 +1,33 @@ +### `tf.contrib.rnn.static_state_saving_rnn(cell, inputs, state_saver, state_name, sequence_length=None, scope=None)` {#static_state_saving_rnn} + +RNN that accepts a state saver for time-truncated RNN calculation. + +##### Args: + + +* `cell`: An instance of `RNNCell`. +* `inputs`: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`. +* `state_saver`: A state saver object with methods `state` and `save_state`. +* `state_name`: Python string or tuple of strings. The name to use with the + state_saver. 
If the cell returns tuples of states (i.e., + `cell.state_size` is a tuple) then `state_name` should be a tuple of + strings having the same length as `cell.state_size`. Otherwise it should + be a single string. +* `sequence_length`: (optional) An int32/int64 vector size [batch_size]. + See the documentation for rnn() for more details about sequence_length. +* `scope`: VariableScope for the created subgraph; defaults to "rnn". + +##### Returns: + + A pair (outputs, state) where: + outputs is a length T list of outputs (one for each input) + states is the final state + +##### Raises: + + +* `TypeError`: If `cell` is not an instance of RNNCell. +* `ValueError`: If `inputs` is `None` or an empty list, or if the arity and + type of `state_name` does not match that of `cell.state_size`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md index 5e49278a182458..1cbc6177de8172 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md @@ -28,13 +28,14 @@ one of the following methods is recommended. First, if the function is expressible as TensorFlow ops, use ```python - result = SparseTensor(input.indices, fn(input.values), input.shape) + result = SparseTensor(input.indices, fn(input.values), input.dense_shape) ``` If, however, the function is not expressible as a TensorFlow op, then use ```python -result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) +result = SparseTensor( + input.indices, map_fn(fn, input.values), input.dense_shape) ``` instead. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.merge_all_summaries.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.merge_all_summaries.md deleted file mode 100644 index 40143de15dce22..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.merge_all_summaries.md +++ /dev/null @@ -1,16 +0,0 @@ -### `tf.merge_all_summaries(key='summaries')` {#merge_all_summaries} - -Merges all summaries collected in the default graph. - -##### Args: - - -* `key`: `GraphKey` used to collect the summaries. Defaults to - `GraphKeys.SUMMARIES`. - -##### Returns: - - If no summaries were collected, returns None. Otherwise returns a scalar - `Tensor` of type `string` containing the serialized `Summary` protocol - buffer resulting from the merging. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.reduce_join.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.reduce_join.md index ad49e9827412fb..a93d8208ffc4e4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.reduce_join.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.reduce_join.md @@ -1,4 +1,4 @@ -### `tf.reduce_join(inputs, reduction_indices, keep_dims=None, separator=None, name=None)` {#reduce_join} +### `tf.reduce_join(inputs, axis=None, keep_dims=False, separator='', name=None, reduction_indices=None)` {#reduce_join} Joins a string Tensor across the given dimensions. @@ -6,7 +6,7 @@ Computes the string join across dimensions in the given string Tensor of shape `[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input strings with the given separator (default: empty string). Negative indices are counted backwards from the end, with `-1` being equivalent to `n - 1`. 
Passing -an empty `reduction_indices` joins all strings in linear index order and outputs +an empty `axis` joins all strings in linear index order and outputs a scalar string. @@ -31,9 +31,9 @@ tf.reduce_join(a, []) ==> ["abcd"] * `inputs`: A `Tensor` of type `string`. The input to be joined. All reduced indices must have non-zero size. -* `reduction_indices`: A `Tensor` of type `int32`. +* `axis`: A `Tensor` of type `int32`. The dimensions to reduce over. Dimensions are reduced in the - order specified. Omitting `reduction_indices` is equivalent to passing + order specified. Omitting `axis` is equivalent to passing `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. * `keep_dims`: An optional `bool`. Defaults to `False`. If `True`, retain reduced dimensions with length `1`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.split_v.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.split_v.md index 7e27ca6ac44612..f9f83750fa61c8 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.split_v.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.split_v.md @@ -1,12 +1,12 @@ -### `tf.split_v(value, size_splits, split_dim=0, num=None, name='split_v')` {#split_v} +### `tf.split_v(value=None, num_or_size_splits=None, axis=0, num=None, name='split_v')` {#split_v} Splits a tensor into sub tensors. -If size_splits is a scalar, `num_split`, then -splits `value` along dimension `split_dim` into `num_split` smaller tensors. +If num_or_size_splits is a scalar, `num_split`, then +splits `value` along dimension `axis` into `num_split` smaller tensors. Requires that `num_split` evenly divide `value.shape[split_dim]`. -If size_splits is a tensor, then +If num_or_size_splits is a tensor, then splits `value` into len(size_splits) pieces each the same size as the input except along dimension split_dim where the size is size_splits[i]. @@ -28,12 +28,12 @@ tf.shape(split0) ==> [5, 10] * `value`: The `Tensor` to split. -* `size_splits`: Either an integer indicating the number of splits along +* `num_or_size_splits`: Either an integer indicating the number of splits along split_dim or a 1-D Tensor containing the sizes of each output tensor along split_dim. If an integer then it must evenly divide value.shape[split_dim]; otherwise the sum of sizes along the split dimension must match that of the input. -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. Defaults to 0. * `num`: Optional, used to specify the number of outputs when it cannot be inferred from the shape of size_splits. @@ -41,7 +41,10 @@ tf.shape(split0) ==> [5, 10] ##### Returns: - `len(size_splits)` `Tensor` objects resulting from splitting `value`. + if `num_or_size_splits` is a scalar returns `num_or_size_splits` `Tensor` + objects; if `num_or_size_splits` is a 1-D Tensor returns + `num_or_size_splits.get_shape[0]` `Tensor` objects resulting from splitting + `value`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.maybe_batch.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.maybe_batch.md new file mode 100644 index 00000000000000..610d5badcb83fe --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.maybe_batch.md @@ -0,0 +1,39 @@ +### `tf.train.maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_batch} + +Conditionally creates batches of tensors based on `keep_input`. + +See docstring in `batch` for more details. + +##### Args: + + +* `tensors`: The list or dictionary of tensors to enqueue. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `batch_size`: The new batch size pulled from the queue. +* `num_threads`: The number of threads enqueuing `tensors`. +* `capacity`: An integer. The maximum number of elements in the queue. +* `enqueue_many`: Whether each tensor in `tensors` is a single example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors`. +* `dynamic_pad`: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (Optional). If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same types as `tensors`. + +##### Raises: + + +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. 
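A sketch of how the new `tf.train.maybe_batch` gate can act as an in-graph filter. The `decode_example` producer below is a hypothetical stand-in for a real reader/parser; only the `keep_input` wiring follows the signature documented above.

```python
import tensorflow as tf

def decode_example():
    # Hypothetical single-example producer; in practice this would wrap a reader
    # such as tf.TFRecordReader plus tf.parse_single_example.
    feature = tf.random_uniform([10])
    label = tf.random_uniform([], minval=-1, maxval=2, dtype=tf.int32)
    return feature, label

feature, label = decode_example()
keep = label >= 0  # scalar bool Tensor: drop examples whose label is negative

features_batch, labels_batch = tf.train.maybe_batch(
    [feature, label], keep_input=keep, batch_size=32, num_threads=2, capacity=256)
```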
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseFeature.__new__.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseFeature.__new__.md new file mode 100644 index 00000000000000..167611ebd5bc42 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseFeature.__new__.md @@ -0,0 +1,4 @@ +#### `tf.SparseFeature.__new__(_cls, index_key, value_key, dtype, size, already_sorted=False)` {#SparseFeature.__new__} + +Create new instance of SparseFeature(index_key, value_key, dtype, size, already_sorted) + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseTensorValue.__new__.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseTensorValue.__new__.md deleted file mode 100644 index e805f29b985c83..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.SparseTensorValue.__new__.md +++ /dev/null @@ -1,4 +0,0 @@ -#### `tf.SparseTensorValue.__new__(_cls, indices, values, shape)` {#SparseTensorValue.__new__} - -Create new instance of SparseTensorValue(indices, values, shape) - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md index 8afabb054f6f02..8951836ee5c047 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md @@ -52,11 +52,12 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. + `input_fn` or `feed_fn` is provided. + Or if `metrics` is not `None` or `dict`. - - - @@ -69,36 +70,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. 
- signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -117,8 +121,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -193,39 +198,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. 
Or when model is taking long time - to converge, and you want to split up training into subparts. +This is especially useful when the whole dataset is too big to +fit in memory at the same time. Or when model is taking long time +to converge, and you want to split up training into subparts. + +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. * `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. +##### Returns: -* `Returns`: - `self`, for chaining. + `self`, for chaining. + +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. - - - @@ -244,30 +251,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). +##### Returns: -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. 
+ +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.monitors.ExportMonitor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.monitors.ExportMonitor.md index 53992bdf4f15d9..bf3fa842a3f80f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.monitors.ExportMonitor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.monitors.ExportMonitor.md @@ -9,27 +9,31 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will both become required args. - Args: - every_n_steps: Run monitor every N steps. - export_dir: str, folder to export. - input_fn: A function that takes no argument and returns a tuple of - (features, labels), where features is a dict of string key to `Tensor` - and labels is a `Tensor` that's currently not used (and so can be - `None`). - input_feature_key: String key into the features dict returned by - `input_fn` that corresponds to the raw `Example` strings `Tensor` that - the exported model will take as input. Should be `None` if and only if - you're passing in a `signature_fn` that does not use the first arg - (`Tensor` of `Example` strings). - exports_to_keep: int, number of exports to keep. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `dict` of `Tensor`s for predictions. - default_batch_size: Default batch size of the `Example` placeholder. - - Raises: - ValueError: If `input_fn` and `input_feature_key` are not both defined or - are not both `None`. +##### Args: + + +* `every_n_steps`: Run monitor every N steps. +* `export_dir`: str, folder to export. +* `input_fn`: A function that takes no argument and returns a tuple of + (features, labels), where features is a dict of string key to `Tensor` + and labels is a `Tensor` that's currently not used (and so can be + `None`). +* `input_feature_key`: String key into the features dict returned by + `input_fn` that corresponds to the raw `Example` strings `Tensor` that + the exported model will take as input. Should be `None` if and only if + you're passing in a `signature_fn` that does not use the first arg + (`Tensor` of `Example` strings). +* `exports_to_keep`: int, number of exports to keep. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `dict` of `Tensor`s for predictions. +* `default_batch_size`: Default batch size of the `Example` placeholder. + +##### Raises: + + +* `ValueError`: If `input_fn` and `input_feature_key` are not both defined or + are not both `None`. 
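A hedged sketch of wiring the `ExportMonitor` above into a `tf.contrib.learn` training loop. `serving_input_fn` and `my_estimator` are hypothetical placeholders, and the deprecation notes above mean the exact argument set may differ between versions.

```python
import tensorflow as tf

def serving_input_fn():
    # Hypothetical serving input: raw serialized Example strings under the key
    # that is also passed as input_feature_key below.
    examples = tf.placeholder(tf.string, shape=[None], name='input_example')
    return {'examples': examples}, None

monitor = tf.contrib.learn.monitors.ExportMonitor(
    every_n_steps=1000,
    export_dir='/tmp/my_model_export',
    input_fn=serving_input_fn,
    input_feature_key='examples',
    exports_to_keep=5)

# my_estimator is any tf.contrib.learn estimator constructed elsewhere:
# my_estimator.fit(input_fn=train_input_fn, steps=10000, monitors=[monitor])
```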
- - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_decoder.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_decoder.md new file mode 100644 index 00000000000000..a3e6aa804f9b9c --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_decoder.md @@ -0,0 +1,52 @@ +### `tf.contrib.legacy_seq2seq.embedding_attention_decoder(decoder_inputs, initial_state, attention_states, cell, num_symbols, embedding_size, num_heads=1, output_size=None, output_projection=None, feed_previous=False, update_embedding_for_previous=True, dtype=None, scope=None, initial_state_attention=False)` {#embedding_attention_decoder} + +RNN decoder with embedding and attention and a pure-decoding option. + +##### Args: + + +* `decoder_inputs`: A list of 1D batch-sized int32 Tensors (decoder inputs). +* `initial_state`: 2D Tensor [batch_size x cell.state_size]. +* `attention_states`: 3D Tensor [batch_size x attn_length x attn_size]. +* `cell`: rnn_cell.RNNCell defining the cell function. +* `num_symbols`: Integer, how many symbols come into the embedding. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_heads`: Number of attention heads that read from attention_states. +* `output_size`: Size of the output vectors; if None, use output_size. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. +* `feed_previous`: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). +* `update_embedding_for_previous`: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. +* `dtype`: The dtype to use for the RNN initial states (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_seq2seq.md new file mode 100644 index 00000000000000..5de66ad5e9326c --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.embedding_attention_seq2seq.md @@ -0,0 +1,50 @@ +### `tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False)` {#embedding_attention_seq2seq} + +Embedding sequence-to-sequence model with attention. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_encoder_symbols x input_size]). Then it runs an RNN to encode +embedded encoder_inputs into a state vector. It keeps the outputs of this +RNN at every step to use for attention later. Next, it embeds decoder_inputs +by another newly created embedding (of shape [num_decoder_symbols x +input_size]). Then it runs attention decoder, initialized with the last +encoder state, on embedded decoder_inputs and attending to encoder outputs. + +Warning: when output_projection is None, the size of the attention vectors +and variables will be made proportional to num_decoder_symbols, can be large. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols`: Integer; number of symbols on the decoder side. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_heads`: Number of attention heads that read from attention_states. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial RNN state (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". +* `initial_state_attention`: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states. + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. 
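A sketch that follows the `embedding_attention_seq2seq` signature documented in the new file above; the vocabulary sizes, sequence lengths, and cell choice are arbitrary, and the placeholders stand in for real token-id feeds.

```python
import tensorflow as tf

batch_size, enc_steps, dec_steps = 32, 20, 25
vocab_in, vocab_out, embed_dim, num_units = 10000, 8000, 128, 256

encoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(enc_steps)]
decoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(dec_steps)]
cell = tf.contrib.rnn.GRUCell(num_units)

outputs, state = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, cell,
    num_encoder_symbols=vocab_in,
    num_decoder_symbols=vocab_out,
    embedding_size=embed_dim,
    feed_previous=False)  # True would instead feed back greedy argmax outputs
# outputs: a list of dec_steps tensors, each of shape [batch_size, vocab_out]
```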
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.rnn_decoder.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.rnn_decoder.md new file mode 100644 index 00000000000000..00cafc27bc0c65 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.legacy_seq2seq.rnn_decoder.md @@ -0,0 +1,31 @@ +### `tf.contrib.legacy_seq2seq.rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, scope=None)` {#rnn_decoder} + +RNN decoder for the sequence-to-sequence model. + +##### Args: + + +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `initial_state`: 2D Tensor with shape [batch_size x cell.state_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `loop_function`: If not None, this function will be applied to the i-th output + in order to generate the i+1-st input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. +* `scope`: VariableScope for the created subgraph; defaults to "rnn_decoder". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing generated outputs. +* `state`: The state of each cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + (Note that in some cases, like basic RNN cell or GRU cell, outputs and + states can be the same. They are different for LSTM cells though.) + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.rnn.static_bidirectional_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.rnn.static_bidirectional_rnn.md new file mode 100644 index 00000000000000..b4cc966e32b40d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.rnn.static_bidirectional_rnn.md @@ -0,0 +1,48 @@ +### `tf.contrib.rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs, initial_state_fw=None, initial_state_bw=None, dtype=None, sequence_length=None, scope=None)` {#static_bidirectional_rnn} + +Creates a bidirectional recurrent neural network. + +Similar to the unidirectional case above (rnn) but takes input and builds +independent forward and backward RNNs with the final forward and backward +outputs depth-concatenated, such that the output will have the format +[time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of +forward and backward cell must match. The initial state for both directions +is zero by default (but can be set optionally) and no intermediate states are +ever returned -- the network is fully unrolled for the given (passed in) +length(s) of the sequence(s) or completely unrolled if length(s) is not given. + +##### Args: + + +* `cell_fw`: An instance of RNNCell, to be used for forward direction. +* `cell_bw`: An instance of RNNCell, to be used for backward direction. +* `inputs`: A length T list of inputs, each a tensor of shape + [batch_size, input_size], or a nested tuple of such elements. 
+* `initial_state_fw`: (optional) An initial state for the forward RNN. + This must be a tensor of appropriate type and shape + `[batch_size, cell_fw.state_size]`. + If `cell_fw.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +* `initial_state_bw`: (optional) Same as for `initial_state_fw`, but using + the corresponding properties of `cell_bw`. +* `dtype`: (optional) The data type for the initial state. Required if + either of the initial states are not provided. +* `sequence_length`: (optional) An int32/int64 vector, size `[batch_size]`, + containing the actual lengths for each of the sequences. +* `scope`: VariableScope for the created subgraph; defaults to + "bidirectional_rnn" + +##### Returns: + + A tuple (outputs, output_state_fw, output_state_bw) where: + outputs is a length `T` list of outputs (one for each input), which + are depth-concatenated forward and backward outputs. + output_state_fw is the final state of the forward rnn. + output_state_bw is the final state of the backward rnn. + +##### Raises: + + +* `TypeError`: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. +* `ValueError`: If inputs is None or an empty list. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image_summary.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image_summary.md deleted file mode 100644 index 5df729544bfbbc..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image_summary.md +++ /dev/null @@ -1,48 +0,0 @@ -### `tf.image_summary(tag, tensor, max_images=3, collections=None, name=None)` {#image_summary} - -Outputs a `Summary` protocol buffer with images. - -The summary has up to `max_images` summary values containing images. The -images are built from `tensor` which must be 4-D with shape `[batch_size, -height, width, channels]` and where `channels` can be: - -* 1: `tensor` is interpreted as Grayscale. -* 3: `tensor` is interpreted as RGB. -* 4: `tensor` is interpreted as RGBA. - -The images have the same number of channels as the input tensor. For float -input, the values are normalized one image at a time to fit in the range -`[0, 255]`. `uint8` values are unchanged. The op uses two different -normalization algorithms: - -* If the input values are all positive, they are rescaled so the largest one - is 255. - -* If any input value is negative, the values are shifted so input value 0.0 - is at 127. They are then rescaled so that either the smallest value is 0, - or the largest one is 255. - -The `tag` argument is a scalar `Tensor` of type `string`. It is used to -build the `tag` of the summary values: - -* If `max_images` is 1, the summary value tag is '*tag*/image'. -* If `max_images` is greater than 1, the summary value tags are - generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. - -##### Args: - - -* `tag`: A scalar `Tensor` of type `string`. Used to build the `tag` - of the summary values. -* `tensor`: A 4-D `uint8` or `float32` `Tensor` of shape `[batch_size, height, - width, channels]` where `channels` is 1, 3, or 4. -* `max_images`: Max number of batch elements to generate images for. -* `collections`: Optional list of ops.GraphKeys. The collections to add the - summary to. Defaults to [ops.GraphKeys.SUMMARIES] -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. 
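The deleted `tf.image_summary` doc above corresponds to the broader move of summary ops into the `tf.summary` module; assuming that replacement applies to your build, the rough equivalent looks like the sketch below (note `max_images` becomes `max_outputs`).

```python
import tensorflow as tf

images = tf.placeholder(tf.uint8, [None, 64, 64, 3])  # [batch, height, width, channels]

# Previously: tf.image_summary('inputs', images, max_images=3)
summary_op = tf.summary.image('inputs', images, max_outputs=3)
```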
- diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.sparse_placeholder.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.sparse_placeholder.md index def6c8329dcfc9..c1fa1d12e6794a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.sparse_placeholder.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.sparse_placeholder.md @@ -23,7 +23,7 @@ with tf.Session() as sess: print(sess.run(y, feed_dict={ x: (indices, values, shape)})) # Will succeed. - sp = tf.SparseTensor(indices=indices, values=values, shape=shape) + sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape) sp_value = sp.eval(session) print(sess.run(y, feed_dict={x: sp_value})) # Will succeed. ``` diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.split.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.split.md index b6bfac36d47670..3b5479a922c9ae 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.split.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.split.md @@ -1,9 +1,11 @@ -### `tf.split(split_dim, num_split, value, name='split')` {#split} +### `tf.split(axis=None, num_or_size_splits=None, value=None, name='split', split_dim=None)` {#split} + +DEPRECATED: use split_v; split_v rename to split happening soon. Splits a tensor into `num_split` tensors along one dimension. -Splits `value` along dimension `split_dim` into `num_split` smaller tensors. -Requires that `num_split` evenly divide `value.shape[split_dim]`. +Splits `value` along dimension `axis` into `num_or_size_splits` smaller +tensors. Requires that `num_or_size_splits` evenly divide `value.shape[axis]`. For example: @@ -31,11 +33,13 @@ tf.unpack(t, axis=axis) ##### Args: -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. -* `num_split`: A Python integer. The number of ways to split. +* `num_or_size_splits`: A Python integer. The number of ways to split. Has a + different meaning in split_v (see docs). * `value`: The `Tensor` to split. * `name`: A name for the operation (optional). +* `split_dim`: The old (deprecated) name for axis. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.squeeze.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.squeeze.md index f1590ac762e92e..90a1b9af8258d2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.squeeze.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.squeeze.md @@ -12,7 +12,7 @@ For example: ```prettyprint # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] shape(squeeze(t)) ==> [2, 3] - ``` +``` Or, to remove specific size 1 dimensions: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md index c790b106c89c11..4ee4c44388c9a5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md @@ -452,7 +452,7 @@ then compares them using self._AssertProtoEqual(). 
- - - -#### `tf.test.TestCase.assertProtoEqualsVersion(expected, actual, producer=17, min_consumer=0)` {#TestCase.assertProtoEqualsVersion} +#### `tf.test.TestCase.assertProtoEqualsVersion(expected, actual, producer=18, min_consumer=0)` {#TestCase.assertProtoEqualsVersion} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.CheckpointSaverHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.CheckpointSaverHook.md index e0568d0d858546..a1d93363b444a2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.CheckpointSaverHook.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.CheckpointSaverHook.md @@ -1,7 +1,7 @@ Saves checkpoints every N steps or seconds. - - - -#### `tf.train.CheckpointSaverHook.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaverHook.__init__} +#### `tf.train.CheckpointSaverHook.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None, listeners=None)` {#CheckpointSaverHook.__init__} Initialize CheckpointSaverHook monitor. @@ -14,6 +14,10 @@ Initialize CheckpointSaverHook monitor. * `saver`: `Saver` object, used for saving. * `checkpoint_basename`: `str`, base name for the checkpoint files. * `scaffold`: `Scaffold`, use to get saver object. +* `listeners`: List of `CheckpointSaverListener` subclass instances. + Used for callbacks that run immediately after the corresponding + CheckpointSaverHook callbacks, only in steps where the + CheckpointSaverHook was triggered. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.maybe_shuffle_batch_join.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.maybe_shuffle_batch_join.md new file mode 100644 index 00000000000000..ab5543d378bc2f --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.maybe_shuffle_batch_join.md @@ -0,0 +1,40 @@ +### `tf.train.maybe_shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, keep_input, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_shuffle_batch_join} + +Create batches by randomly shuffling conditionally-enqueued tensors. + +See docstring in `shuffle_batch_join` for more details. + +##### Args: + + +* `tensors_list`: A list of tuples or dictionaries of tensors to enqueue. +* `batch_size`: An integer. The new batch size pulled from the queue. +* `capacity`: An integer. The maximum number of elements in the queue. +* `min_after_dequeue`: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. +* `keep_input`: A `bool` scalar Tensor. If provided, this tensor controls + whether the input is added to the queue or not. If it evaluates `True`, + then `tensors_list` are added to the queue; otherwise they are dropped. + This tensor essentially acts as a filtering mechanism. +* `seed`: Seed for the random shuffling within the queue. +* `enqueue_many`: Whether each tensor in `tensor_list_list` is a single + example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors_list[i]`. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (optional). 
If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + +##### Raises: + + +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors_list`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.SparseTensor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.SparseTensor.md index d89b4e70c4ccfe..137cf621d4f9a3 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.SparseTensor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.SparseTensor.md @@ -1,37 +1,41 @@ Represents a sparse tensor. TensorFlow represents a sparse tensor as three separate dense tensors: -`indices`, `values`, and `shape`. In Python, the three tensors are +`indices`, `values`, and `dense_shape`. In Python, the three tensors are collected into a `SparseTensor` class for ease of use. If you have separate -`indices`, `values`, and `shape` tensors, wrap them in a `SparseTensor` +`indices`, `values`, and `dense_shape` tensors, wrap them in a `SparseTensor` object before passing to the ops below. -Concretely, the sparse tensor `SparseTensor(indices, values, shape)` +Concretely, the sparse tensor `SparseTensor(indices, values, dense_shape)` comprises the following components, where `N` and `ndims` are the number of values and number of dimensions in the `SparseTensor`, respectively: -* `indices`: A 2-D int64 tensor of shape `[N, ndims]`, which specifies +* `indices`: A 2-D int64 tensor of dense_shape `[N, ndims]`, which specifies the indices of the elements in the sparse tensor that contain nonzero values (elements are zero-indexed). For example, `indices=[[1,3], [2,4]]` specifies that the elements with indexes of [1,3] and [2,4] have nonzero values. -* `values`: A 1-D tensor of any type and shape `[N]`, which supplies the +* `values`: A 1-D tensor of any type and dense_shape `[N]`, which supplies the values for each element in `indices`. For example, given `indices=[[1,3], [2,4]]`, the parameter `values=[18, 3.6]` specifies that element [1,3] of the sparse tensor has a value of 18, and element [2,4] of the tensor has a value of 3.6. -* `shape`: A 1-D int64 tensor of shape `[ndims]`, which specifies the shape - of the sparse tensor. Takes a list indicating the number of elements in - each dimension. For example, `shape=[3,6]` specifies a two-dimensional 3x6 - tensor, `shape=[2,3,4]` specifies a three-dimensional 2x3x4 tensor, and - `shape=[9]` specifies a one-dimensional tensor with 9 elements. +* `dense_shape`: A 1-D int64 tensor of dense_shape `[ndims]`, which specifies +the + dense_shape of the sparse tensor. Takes a list indicating the number of + elements + in each dimension. For example, `dense_shape=[3,6]` specifies a + two-dimensional + 3x6 tensor, `dense_shape=[2,3,4]` specifies a three-dimensional 2x3x4 + tensor, and + `dense_shape=[9]` specifies a one-dimensional tensor with 9 elements. The corresponding dense tensor satisfies: ```python -dense.shape = shape +dense.shape = dense_shape dense[tuple(indices[i])] = values[i] ``` @@ -44,7 +48,7 @@ obtained by calling `tf.sparse_reorder(st)`. 
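A brief sketch of that reordering step, with hypothetical values and the renamed `dense_shape` keyword:

```python
import tensorflow as tf

# Indices deliberately given out of row-major order.
st = tf.SparseTensor(indices=[[1, 2], [0, 0]], values=[2, 1], dense_shape=[3, 4])
st_ordered = tf.sparse_reorder(st)  # canonical, row-major ordering

with tf.Session() as sess:
    print(sess.run(st_ordered.indices))  # [[0, 0], [1, 2]]
    print(sess.run(st_ordered.values))   # [1, 2]
```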
Example: The sparse tensor ```python -SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) +SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) ``` represents the dense tensor @@ -57,27 +61,34 @@ represents the dense tensor - - - -#### `tf.SparseTensor.__init__(indices, values, shape)` {#SparseTensor.__init__} +#### `tf.SparseTensor.__init__(indices, values, dense_shape=None, shape=None)` {#SparseTensor.__init__} Creates a `SparseTensor`. ##### Args: -* `indices`: A 2-D int64 tensor of shape `[N, ndims]`. -* `values`: A 1-D tensor of any type and shape `[N]`. -* `shape`: A 1-D int64 tensor of shape `[ndims]`. +* `indices`: A 2-D int64 tensor of dense_shape `[N, ndims]`. +* `values`: A 1-D tensor of any type and dense_shape `[N]`. +* `dense_shape`: A 1-D int64 tensor of dense_shape `[ndims]`. +* `shape`: Temporary. Legacy naming of dense_shape. Only one of `shape` or + `dense_shape` must be provided. ##### Returns: - A `SparseTensor` + A `SparseTensor`. + +##### Raises: + + +* `ValueError`: if both `shape` and `dense_shape` are provided. - - - #### `tf.SparseTensor.get_shape()` {#SparseTensor.get_shape} -Get the `TensorShape` that represents the shape of the dense tensor. +Get the `TensorShape` representing the shape of the dense tensor. ##### Returns: @@ -92,7 +103,7 @@ The indices of non-zero values in the represented dense tensor. ##### Returns: - A 2-D Tensor of int64 with shape `[N, ndims]`, where `N` is the + A 2-D Tensor of int64 with dense_shape `[N, ndims]`, where `N` is the number of non-zero values in the tensor, and `ndims` is the rank. @@ -109,7 +120,7 @@ The non-zero values in the represented dense tensor. - - - -#### `tf.SparseTensor.shape` {#SparseTensor.shape} +#### `tf.SparseTensor.dense_shape` {#SparseTensor.dense_shape} A 1-D Tensor of int64 representing the shape of the dense tensor. @@ -132,7 +143,7 @@ The `Operation` that produces `values` as an output. #### `tf.SparseTensor.graph` {#SparseTensor.graph} -The `Graph` that contains the index, value, and shape tensors. +The `Graph` that contains the index, value, and dense_shape tensors. @@ -248,3 +259,10 @@ available, or `session` must be specified explicitly. +- - - + +#### `tf.SparseTensor.shape` {#SparseTensor.shape} + +Legacy property returning `dense_shape`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md index 0f3006d9cae0a0..7cc4748251eed1 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md @@ -89,11 +89,12 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. + `input_fn` or `feed_fn` is provided. + Or if `metrics` is not `None` or `dict`. - - - @@ -106,36 +107,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. 
input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 
2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -147,28 +151,33 @@ Exports inference graph as a SavedModel into given dir. (experimental) THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning. - Args: - export_dir_base: A string containing a directory to write the exported - graph and checkpoints. - input_fn: A function that takes no argument and - returns an `InputFnOps`. - default_output_alternative_key: the name of the head to serve when none is - specified. - assets_extra: A dict specifying how to populate the assets.extra directory - within the exported SavedModel. Each key should give the destination - path (including the filename) relative to the assets.extra directory. - The corresponding value gives the full path of the source file to be - copied. For example, the simple case of copying a single file without - renaming it is specified as - `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. - as_text: whether to write the SavedModel proto in text format. - exports_to_keep: Number of exports to keep. +##### Args: + + +* `export_dir_base`: A string containing a directory to write the exported + graph and checkpoints. +* `input_fn`: A function that takes no argument and + returns an `InputFnOps`. +* `default_output_alternative_key`: the name of the head to serve when none is + specified. +* `assets_extra`: A dict specifying how to populate the assets.extra directory + within the exported SavedModel. Each key should give the destination + path (including the filename) relative to the assets.extra directory. + The corresponding value gives the full path of the source file to be + copied. For example, the simple case of copying a single file without + renaming it is specified as + `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. +* `as_text`: whether to write the SavedModel proto in text format. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. + +##### Raises: - Returns: - The string path to the exported directory. - Raises: - ValueError: if an unrecognized export_type is requested. +* `ValueError`: if an unrecognized export_type is requested. - - - @@ -187,8 +196,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -263,39 +273,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. +This is especially useful when the whole dataset is too big to +fit in memory at the same time. 
Or when model is taking long time +to converge, and you want to split up training into subparts. + +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. * `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. + +##### Returns: + `self`, for chaining. -* `Returns`: - `self`, for chaining. +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. - - - @@ -314,30 +326,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). +##### Returns: + + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. 
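Since the Estimator entries above describe the move away from `x`/`y`/`batch_size` toward `input_fn`, here is a rough, hypothetical sketch of that call pattern. The toy `model_fn` and `input_fn` below are illustrative only, and the tuple return value `(predictions, loss, train_op)` assumes the behavior of this release's `tf.contrib.learn.Estimator`:

```python
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

def model_fn(features, labels):
    # Toy linear regression; returns (predictions, loss, train_op).
    predictions = tf.contrib.layers.linear(features, 1)
    loss = tf.contrib.losses.mean_squared_error(predictions=predictions,
                                                labels=labels)
    train_op = tf.contrib.layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(),
        learning_rate=0.1, optimizer='SGD')
    return predictions, loss, train_op

def input_fn():
    # Features and labels come from the graph, not from x/y numpy arguments.
    x = tf.constant(np.random.rand(100, 3), dtype=tf.float32)
    y = tf.reduce_sum(x, 1, keep_dims=True)
    return x, y

est = learn.Estimator(model_fn=model_fn)
est.fit(input_fn=input_fn, steps=100)
```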
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriterCache.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummaryWriterCache.md similarity index 63% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriterCache.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummaryWriterCache.md index 01136ac630396e..8c700a1899d733 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriterCache.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummaryWriterCache.md @@ -3,14 +3,14 @@ Cache for file writers. This class caches file writers, one per directory. - - - -#### `tf.train.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} +#### `tf.contrib.learn.monitors.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} Clear cached summary writers. Currently only used for unit tests. - - - -#### `tf.train.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} +#### `tf.contrib.learn.monitors.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} Returns the FileWriter for the specified directory. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.basic_rnn_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.basic_rnn_seq2seq.md new file mode 100644 index 00000000000000..12e5851497183a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.basic_rnn_seq2seq.md @@ -0,0 +1,26 @@ +### `tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, dtype=tf.float32, scope=None)` {#basic_rnn_seq2seq} + +Basic RNN sequence-to-sequence model. + +This model first runs an RNN to encode encoder_inputs into a state vector, +then runs decoder, initialized with the last encoder state, on decoder_inputs. +Encoder and decoder use the same RNN cell type, but don't share parameters. + +##### Args: + + +* `encoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `dtype`: The dtype of the initial state of the RNN cell (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell in the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.tied_rnn_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.tied_rnn_seq2seq.md new file mode 100644 index 00000000000000..5455cefa2db354 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.legacy_seq2seq.tied_rnn_seq2seq.md @@ -0,0 +1,30 @@ +### `tf.contrib.legacy_seq2seq.tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, loop_function=None, dtype=tf.float32, scope=None)` {#tied_rnn_seq2seq} + +RNN sequence-to-sequence model with tied encoder and decoder parameters. 
+ +This model first runs an RNN to encode encoder_inputs into a state vector, and +then runs decoder, initialized with the last encoder state, on decoder_inputs. +Encoder and decoder use the same RNN cell and share parameters. + +##### Args: + + +* `encoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `decoder_inputs`: A list of 2D Tensors [batch_size x input_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `loop_function`: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol), see rnn_decoder for details. +* `dtype`: The dtype of the initial state of the rnn cell (default: tf.float32). +* `scope`: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. +* `state`: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.add_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.add_loss.md index 8905bf8f22cf0f..ba2cba6f1bc1e0 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.add_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.add_loss.md @@ -1,6 +1,10 @@ ### `tf.contrib.losses.add_loss(*args, **kwargs)` {#add_loss} -Adds a externally defined loss to the collection of losses. +Adds a externally defined loss to the collection of losses. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.add_loss instead. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.cosine_distance.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.cosine_distance.md index edb3a37b1d93fc..bd91d5a4850bab 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.cosine_distance.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.cosine_distance.md @@ -1,28 +1,37 @@ ### `tf.contrib.losses.cosine_distance(*args, **kwargs)` {#cosine_distance} -Adds a cosine-distance loss to the training procedure. (deprecated arguments) +Adds a cosine-distance loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.cosine_distance instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - Note that the function assumes that `predictions` and `labels` are already - unit-normalized. - - Args: - predictions: An arbitrary matrix. - labels: A `Tensor` whose shape matches 'predictions' - dim: The dimension along which the cosine distance is computed. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. 
- scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If `predictions` shape doesn't match `labels` shape, or - `weights` is `None`. +Note that the function assumes that `predictions` and `labels` are already +unit-normalized. + +##### Args: + + +* `predictions`: An arbitrary matrix. +* `labels`: A `Tensor` whose shape matches 'predictions' +* `dim`: The dimension along which the cosine distance is computed. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If `predictions` shape doesn't match `labels` shape, or + `weights` is `None`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.get_regularization_losses.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.get_regularization_losses.md index 020c24c2c6b048..e48896b8fa15fe 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.get_regularization_losses.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.losses.get_regularization_losses.md @@ -1,6 +1,10 @@ -### `tf.contrib.losses.get_regularization_losses(scope=None)` {#get_regularization_losses} +### `tf.contrib.losses.get_regularization_losses(*args, **kwargs)` {#get_regularization_losses} -Gets the regularization losses. +Gets the regularization losses. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.get_regularization_losses instead. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.DropoutWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.DropoutWrapper.md similarity index 82% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.DropoutWrapper.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.DropoutWrapper.md index e8f1d032ff4f54..af7dce705d6e01 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.DropoutWrapper.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.DropoutWrapper.md @@ -1,14 +1,14 @@ Operator adding dropout to inputs and outputs of the given cell. - - - -#### `tf.nn.rnn_cell.DropoutWrapper.__call__(inputs, state, scope=None)` {#DropoutWrapper.__call__} +#### `tf.contrib.rnn.DropoutWrapper.__call__(inputs, state, scope=None)` {#DropoutWrapper.__call__} Run the cell with the declared dropouts. - - - -#### `tf.nn.rnn_cell.DropoutWrapper.__init__(cell, input_keep_prob=1.0, output_keep_prob=1.0, seed=None)` {#DropoutWrapper.__init__} +#### `tf.contrib.rnn.DropoutWrapper.__init__(cell, input_keep_prob=1.0, output_keep_prob=1.0, seed=None)` {#DropoutWrapper.__init__} Create a cell with added input and/or output dropout. @@ -33,21 +33,21 @@ Dropout is never used on the state. 
- - - -#### `tf.nn.rnn_cell.DropoutWrapper.output_size` {#DropoutWrapper.output_size} +#### `tf.contrib.rnn.DropoutWrapper.output_size` {#DropoutWrapper.output_size} - - - -#### `tf.nn.rnn_cell.DropoutWrapper.state_size` {#DropoutWrapper.state_size} +#### `tf.contrib.rnn.DropoutWrapper.state_size` {#DropoutWrapper.state_size} - - - -#### `tf.nn.rnn_cell.DropoutWrapper.zero_state(batch_size, dtype)` {#DropoutWrapper.zero_state} +#### `tf.contrib.rnn.DropoutWrapper.zero_state(batch_size, dtype)` {#DropoutWrapper.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.InputProjectionWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.InputProjectionWrapper.md similarity index 81% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.InputProjectionWrapper.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.InputProjectionWrapper.md index cc65487d9eb905..1898136704f769 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.InputProjectionWrapper.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.InputProjectionWrapper.md @@ -5,14 +5,14 @@ but instead concatenate the whole sequence of your inputs in time, do the projection on this batch-concatenated sequence, then split it. - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.__call__(inputs, state, scope=None)` {#InputProjectionWrapper.__call__} +#### `tf.contrib.rnn.InputProjectionWrapper.__call__(inputs, state, scope=None)` {#InputProjectionWrapper.__call__} Run the input projection and then the cell. - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.__init__(cell, num_proj, input_size=None)` {#InputProjectionWrapper.__init__} +#### `tf.contrib.rnn.InputProjectionWrapper.__init__(cell, num_proj, input_size=None)` {#InputProjectionWrapper.__init__} Create a cell with input projection. @@ -31,21 +31,21 @@ Create a cell with input projection. - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.output_size` {#InputProjectionWrapper.output_size} +#### `tf.contrib.rnn.InputProjectionWrapper.output_size` {#InputProjectionWrapper.output_size} - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.state_size` {#InputProjectionWrapper.state_size} +#### `tf.contrib.rnn.InputProjectionWrapper.state_size` {#InputProjectionWrapper.state_size} - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.zero_state(batch_size, dtype)` {#InputProjectionWrapper.zero_state} +#### `tf.contrib.rnn.InputProjectionWrapper.zero_state(batch_size, dtype)` {#InputProjectionWrapper.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.scalar_summary.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.scalar_summary.md deleted file mode 100644 index 1e8c3479e4b3de..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.scalar_summary.md +++ /dev/null @@ -1,21 +0,0 @@ -### `tf.scalar_summary(tags, values, collections=None, name=None)` {#scalar_summary} - -Outputs a `Summary` protocol buffer with scalar values. - -The input `tags` and `values` must have the same shape. The generated -summary has a summary value for each tag-value pair in `tags` and `values`. - -##### Args: - - -* `tags`: A `string` `Tensor`. Tags for the summaries. -* `values`: A real numeric Tensor. Values for the summaries. 
-* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md index 618f1f0fef681f..70cab998b444c2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md @@ -1,4 +1,4 @@ -### `tf.sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False)` {#sparse_concat} +### `tf.sparse_concat(axis, sp_inputs, name=None, expand_nonconcat_dim=False, concat_dim=None)` {#sparse_concat} Concatenates a list of `SparseTensor` along the specified dimension. @@ -27,7 +27,7 @@ This op runs in `O(M log M)` time, where `M` is the total number of non-empty values across all inputs. This is due to the need for an internal sort in order to concatenate efficiently across an arbitrary dimension. -For example, if `concat_dim = 1` and the inputs are +For example, if `axis = 1` and the inputs are sp_inputs[0]: shape = [2, 3] [0, 2]: "a" @@ -52,7 +52,7 @@ Graphically this is equivalent to doing [ a] concat [ d e ] = [ a d e ] [b c ] [ ] [b c ] -Another example, if 'concat_dim = 1' and the inputs are +Another example, if 'axis = 1' and the inputs are sp_inputs[0]: shape = [3, 3] [0, 2]: "a" @@ -83,12 +83,13 @@ Graphically this is equivalent to doing ##### Args: -* `concat_dim`: Dimension to concatenate along. Must be in range [-rank, rank), +* `axis`: Dimension to concatenate along. Must be in range [-rank, rank), where rank is the number of dimensions in each input `SparseTensor`. * `sp_inputs`: List of `SparseTensor` to concatenate. * `name`: A name prefix for the returned tensors (optional). * `expand_nonconcat_dim`: Whether to allow the expansion in the non-concat dimensions. Defaulted to False. +* `concat_dim`: The old (deprecated) name for axis. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_split.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_split.md index e3e608a9e23bff..11fa3f4465df19 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_split.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_split.md @@ -1,10 +1,10 @@ -### `tf.sparse_split(split_dim, num_split, sp_input, name=None)` {#sparse_split} +### `tf.sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, axis=None, name=None, split_dim=None)` {#sparse_split} -Split a `SparseTensor` into `num_split` tensors along `split_dim`. +Split a `SparseTensor` into `num_split` tensors along `axis`. -If the `sp_input.shape[split_dim]` is not an integer multiple of `num_split` -each slice starting from 0:`shape[split_dim] % num_split` gets extra one -dimension. For example, if `split_dim = 1` and `num_split = 2` and the +If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split` +each slice starting from 0:`shape[axis] % num_split` gets extra one +dimension. 
For example, if `axis = 1` and `num_split = 2` and the input is: input_tensor = shape = [2, 7] @@ -24,10 +24,12 @@ Graphically the output tensors are: ##### Args: -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. -* `num_split`: A Python integer. The number of ways to split. +* `keyword_required`: Python 2 standin for * (temporary for argument reorder) * `sp_input`: The `SparseTensor` to split. +* `num_split`: A Python integer. The number of ways to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. * `name`: A name for the operation (optional). +* `split_dim`: Deprecated old name for axis. ##### Returns: @@ -37,4 +39,5 @@ Graphically the output tensors are: * `TypeError`: If `sp_input` is not a `SparseTensor`. +* `ValueError`: If the deprecated `split_dim` and `axis` are both non None. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.argmin.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.argmin.md index ae686a24cbd5cd..344cb01ce9cc77 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.argmin.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.argmin.md @@ -1,6 +1,6 @@ ### `tf.argmin(input, axis=None, name=None, dimension=None)` {#argmin} -Returns the index with the smallest value across axiss of a tensor. +Returns the index with the smallest value across axes of a tensor. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md index f9fa7c70cb9485..0ad9c0ec1bb706 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md @@ -125,7 +125,7 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate} +#### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#DNNClassifier.evaluate} See evaluable.Evaluable. Note: Labels must be integer class indices. @@ -199,19 +199,22 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - Args: - x: features. - input_fn: Input function. If set, x must be None. - batch_size: Override default batch size. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). +##### Args: + + +* `x`: features. +* `input_fn`: Input function. If set, x must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). 
- Returns: - Numpy array of predicted classes (or an iterable of predicted classes if - as_iterable is True). Each predicted class is represented by its class - index (i.e. integer from 0 to n_classes-1). +##### Returns: + + Numpy array of predicted classes with shape [batch_size] (or an iterable + of predicted classes if as_iterable is True). Each predicted class is + represented by its class index (i.e. integer from 0 to n_classes-1). - - - @@ -226,19 +229,21 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - Args: - x: features. - input_fn: Input function. If set, x and y must be None. - batch_size: Override default batch size. - as_iterable: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). - - Returns: - Numpy array of predicted probabilities (or an iterable of predicted - probabilities if as_iterable is True). Each predicted class is represented - by its class index (i.e. integer from 0 to n_classes-1). +##### Args: + + +* `x`: features. +* `input_fn`: Input function. If set, x and y must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). + +##### Returns: + + Numpy array of predicted probabilities with shape [batch_size, n_classes] + (or an iterable of predicted probabilities if as_iterable is True). - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.Evaluable.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.Evaluable.md index 29d4f69c587656..7644408d96da6f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.Evaluable.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.Evaluable.md @@ -1,7 +1,7 @@ Interface for objects that are evaluatable by, e.g., `Experiment`. - - - -#### `tf.contrib.learn.Evaluable.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Evaluable.evaluate} +#### `tf.contrib.learn.Evaluable.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#Evaluable.evaluate} Evaluates given model with provided evaluation data. @@ -56,9 +56,18 @@ for which this evaluation was performed. * `name`: Name of the evaluation if user needs to run multiple evaluations on different data sets, such as on training data vs test data. +* `checkpoint_path`: Path of a specific checkpoint to evaluate. If `None`, the + latest checkpoint in `model_dir` is used. ##### Returns: Returns `dict` with evaluation results. +- - - + +#### `tf.contrib.learn.Evaluable.model_dir` {#Evaluable.model_dir} + +Returns a path in which the eval process will look for checkpoints. 
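The new `checkpoint_path` argument documented above can be exercised roughly as follows. This is a sketch only: the feature column, model directory, and checkpoint path are hypothetical, and the named checkpoint must already exist for `evaluate` to succeed.

```python
import tensorflow as tf
from tensorflow.contrib import learn

def eval_input_fn():
    features = {'x': tf.random_normal([8, 4])}
    labels = tf.random_uniform([8], maxval=3, dtype=tf.int64)
    return features, labels

est = learn.DNNClassifier(
    hidden_units=[16],
    n_classes=3,
    feature_columns=[tf.contrib.layers.real_valued_column('x', dimension=4)],
    model_dir='/tmp/dnn_model')  # hypothetical directory with saved checkpoints

# Evaluate against one specific checkpoint instead of the latest in model_dir.
metrics = est.evaluate(input_fn=eval_input_fn, steps=10,
                       checkpoint_path='/tmp/dnn_model/model.ckpt-5000')
```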
+ + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.monitors.SummaryWriterCache.clear.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.monitors.SummaryWriterCache.clear.md new file mode 100644 index 00000000000000..b77f11673ddd6a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.monitors.SummaryWriterCache.clear.md @@ -0,0 +1,4 @@ +#### `tf.contrib.learn.monitors.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} + +Clear cached summary writers. Currently only used for unit tests. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq.md new file mode 100644 index 00000000000000..fd297eae06f592 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq.md @@ -0,0 +1,43 @@ +### `tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, num_encoder_symbols, num_decoder_symbols_dict, embedding_size, feed_previous=False, dtype=None, scope=None)` {#one2many_rnn_seq2seq} + +One-to-many RNN sequence-to-sequence model (multi-task). + +This is a multi-task sequence-to-sequence model with one encoder and multiple +decoders. Reference to multi-task sequence-to-sequence learning can be found +here: http://arxiv.org/abs/1511.06114 + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs_dict`: A dictionany mapping decoder name (string) to + the corresponding decoder_inputs; each decoder_inputs is a list of 1D + Tensors of shape [batch_size]; num_decoders is defined as + len(decoder_inputs_dict). +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_encoder_symbols`: Integer; number of symbols on the encoder side. +* `num_decoder_symbols_dict`: A dictionary mapping decoder name (string) to an + integer specifying number of symbols for the corresponding decoder; + len(num_decoder_symbols_dict) must be equal to num_decoders. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first of + decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "one2many_rnn_seq2seq" + +##### Returns: + + A tuple of the form (outputs_dict, state_dict), where: + +* `outputs_dict`: A mapping from decoder name (string) to a list of the same + length as decoder_inputs_dict[name]; each element in the list is a 2D + Tensors with shape [batch_size x num_decoder_symbol_list[name]] + containing the generated outputs. +* `state_dict`: A mapping from decoder name (string) to the final state of the + corresponding decoder RNN; it is a 2D Tensor of shape + [batch_size x cell.state_size]. 
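As a usage sketch for `one2many_rnn_seq2seq` above: the decoder names, vocabulary sizes, and zero-filled token ids below are made up and stand in for real inputs.

```python
import tensorflow as tf

batch_size, seq_len = 32, 10
cell = tf.contrib.rnn.GRUCell(24)

# One shared encoder and two named decoders, all fed integer token ids.
encoder_inputs = [tf.zeros([batch_size], dtype=tf.int32) for _ in range(seq_len)]
decoder_inputs_dict = {
    name: [tf.zeros([batch_size], dtype=tf.int32) for _ in range(seq_len)]
    for name in ('task_a', 'task_b')
}

outputs_dict, state_dict = tf.contrib.legacy_seq2seq.one2many_rnn_seq2seq(
    encoder_inputs,
    decoder_inputs_dict,
    cell,
    num_encoder_symbols=100,
    num_decoder_symbols_dict={'task_a': 50, 'task_b': 80},
    embedding_size=16)
# outputs_dict['task_a'] is a list of seq_len tensors of shape [batch_size, 50].
```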
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.losses.mean_squared_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.losses.mean_squared_error.md index b7ddeb6ece93b7..fd7d10579b523c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.losses.mean_squared_error.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.losses.mean_squared_error.md @@ -1,32 +1,41 @@ ### `tf.contrib.losses.mean_squared_error(*args, **kwargs)` {#mean_squared_error} -Adds a Sum-of-Squares loss to the training procedure. (deprecated arguments) +Adds a Sum-of-Squares loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.mean_squared_error instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. - - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. 
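For the deprecation notice above, the migration is mostly a rename. Keyword arguments are used below to sidestep any difference in positional order between `tf.contrib.losses` and `tf.losses`; the tensors are illustrative.

```python
import tensorflow as tf

predictions = tf.constant([[0.5], [1.5]])
labels = tf.constant([[1.0], [1.0]])

# Deprecated in this release (still works, but logs a deprecation warning).
old_loss = tf.contrib.losses.mean_squared_error(predictions=predictions,
                                                labels=labels)

# Documented replacement.
new_loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)
```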
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md index ee20bbc3c8b08f..88141d43f5919e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md @@ -6,42 +6,48 @@ THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-08. Instructions for updating: Please use `streaming_sparse_recall_at_k`, and reshape labels from [batch_size] to [batch_size, 1]. - The `streaming_recall_at_k` function creates two local variables, `total` and - `count`, that are used to compute the recall@k frequency. This frequency is - ultimately returned as `recall_at_`: an idempotent operation that simply - divides `total` by `count`. - - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `recall_at_`. Internally, an `in_top_k` operation computes a `Tensor` with - shape [batch_size] whose elements indicate whether or not the corresponding - label is in the top `k` `predictions`. Then `update_op` increments `total` - with the reduced sum of `weights` where `in_top_k` is `True`, and it - increments `count` with the reduced sum of `weights`. - - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - - Args: - predictions: A float `Tensor` of dimension [batch_size, num_classes]. - labels: A `Tensor` of dimension [batch_size] whose type is in `int32`, - `int64`. - k: The number of top elements to look at for computing recall. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `recall_at_k` - should be added to. - updates_collections: An optional list of collections `update_op` should be - added to. - name: An optional variable_scope name. - - Returns: - recall_at_k: A `Tensor` representing the recall@k, the fraction of labels - which fall into the top `k` predictions. - update_op: An operation that increments the `total` and `count` variables - appropriately and whose value matches `recall_at_k`. - - Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `weights` is not `None` and its shape doesn't match `predictions`, or if - either `metrics_collections` or `updates_collections` are not a list or - tuple. +The `streaming_recall_at_k` function creates two local variables, `total` and +`count`, that are used to compute the recall@k frequency. This frequency is +ultimately returned as `recall_at_`: an idempotent operation that simply +divides `total` by `count`. + +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`recall_at_`. Internally, an `in_top_k` operation computes a `Tensor` with +shape [batch_size] whose elements indicate whether or not the corresponding +label is in the top `k` `predictions`. Then `update_op` increments `total` +with the reduced sum of `weights` where `in_top_k` is `True`, and it +increments `count` with the reduced sum of `weights`. + +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + +##### Args: + + +* `predictions`: A float `Tensor` of dimension [batch_size, num_classes]. 
+* `labels`: A `Tensor` of dimension [batch_size] whose type is in `int32`, + `int64`. +* `k`: The number of top elements to look at for computing recall. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `recall_at_k` + should be added to. +* `updates_collections`: An optional list of collections `update_op` should be + added to. +* `name`: An optional variable_scope name. + +##### Returns: + + +* `recall_at_k`: A `Tensor` representing the recall@k, the fraction of labels + which fall into the top `k` predictions. +* `update_op`: An operation that increments the `total` and `count` variables + appropriately and whose value matches `recall_at_k`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.training.batch_sequences_with_states.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.training.batch_sequences_with_states.md index 1fb61e5e8fb4f1..fdc64077b15d71 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.training.batch_sequences_with_states.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.training.batch_sequences_with_states.md @@ -42,7 +42,7 @@ batch_size = 32 num_unroll = 20 num_enqueue_threads = 3 lstm_size = 8 -cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) +cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) key, sequences, context = my_parser(raw_data) initial_state_values = tf.zeros((state_size,), dtype=tf.float32) diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md index 8efbc1af99174a..cf5cbe474022c4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md @@ -1,4 +1,4 @@ -### `tf.nn.log_poisson_loss(log_input, targets, compute_full_loss=False, name=None)` {#log_poisson_loss} +### `tf.nn.log_poisson_loss(targets, log_input, compute_full_loss=False, name=None)` {#log_poisson_loss} Computes log Poisson loss given `log_input`. @@ -26,8 +26,8 @@ loss is ##### Args: -* `log_input`: A `Tensor` of type `float32` or `float64`. * `targets`: A `Tensor` of the same type and shape as `log_input`. +* `log_input`: A `Tensor` of type `float32` or `float64`. * `compute_full_loss`: whether to compute the full loss. If false, a constant term is dropped in favor of more efficient optimization. * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md index b0fa6372156db8..f8404a6d94e152 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md @@ -3,8 +3,10 @@ Computes and returns the noise-contrastive estimation training loss. 
See [Noise-contrastive estimation: A new estimation principle for -unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). -Also see our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf) +unnormalized statistical +models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +Also see our [Candidate Sampling Algorithms +Reference](../../extras/candidate_sampling.pdf) Note: By default this uses a log-uniform (Zipfian) distribution for sampling, so your labels must be sorted in order of decreasing frequency to achieve diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sparse_to_indicator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sparse_to_indicator.md index 8ee455be329b6f..ede12c08fece55 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sparse_to_indicator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sparse_to_indicator.md @@ -3,14 +3,14 @@ Converts a `SparseTensor` of ids into a dense bool indicator tensor. The last dimension of `sp_input.indices` is discarded and replaced with -the values of `sp_input`. If `sp_input.shape = [D0, D1, ..., Dn, K]`, then -`output.shape = [D0, D1, ..., Dn, vocab_size]`, where +the values of `sp_input`. If `sp_input.dense_shape = [D0, D1, ..., Dn, K]`, +then `output.shape = [D0, D1, ..., Dn, vocab_size]`, where output[d_0, d_1, ..., d_n, sp_input[d_0, d_1, ..., d_n, k]] = True and False elsewhere in `output`. -For example, if `sp_input.shape = [2, 3, 4]` with non-empty values: +For example, if `sp_input.dense_shape = [2, 3, 4]` with non-empty values: [0, 0, 0]: 0 [0, 1, 0]: 10 diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.maybe_batch_join.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.maybe_batch_join.md new file mode 100644 index 00000000000000..96f605b4325682 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.maybe_batch_join.md @@ -0,0 +1,40 @@ +### `tf.train.maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_batch_join} + +Runs a list of tensors to conditionally fill a queue to create batches. + +See docstring in `batch_join` for more details. + +##### Args: + + +* `tensors_list`: A list of tuples or dictionaries of tensors to enqueue. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `batch_size`: An integer. The new batch size pulled from the queue. +* `capacity`: An integer. The maximum number of elements in the queue. +* `enqueue_many`: Whether each tensor in `tensor_list_list` is a single + example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list_list[i]`. +* `dynamic_pad`: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. 
+* `shared_name`: (Optional) If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + +##### Raises: + + +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensor_list_list`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.while_loop.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.while_loop.md index 5149bbecd73e3c..b6db83be5f3d0b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.while_loop.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.while_loop.md @@ -35,7 +35,7 @@ a) If a loop variable is a SparseTensor, the shape invariant must be TensorShape([r]) where r is the rank of the dense tensor represented by the sparse tensor. It means the shapes of the three tensors of the SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here -is the shape of the SparseTensor.shape property. It must be the shape of +is the shape of the SparseTensor.dense_shape property. It must be the shape of a vector. b) If a loop variable is an IndexedSlices, the shape invariant must be diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.batch_matmul.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.batch_matmul.md deleted file mode 100644 index a4764435b80933..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.batch_matmul.md +++ /dev/null @@ -1,41 +0,0 @@ -### `tf.batch_matmul(x, y, adj_x=None, adj_y=None, name=None)` {#batch_matmul} - -Multiplies slices of two tensors in batches. - -Multiplies all slices of `Tensor` `x` and `y` (each slice can be -viewed as an element of a batch), and arranges the individual results -in a single output tensor of the same batch size. Each of the -individual slices can optionally be adjointed (to adjoint a matrix -means to transpose and conjugate it) before multiplication by setting -the `adj_x` or `adj_y` flag to `True`, which are by default `False`. - -The input tensors `x` and `y` are 3-D or higher with shape `[..., r_x, c_x]` -and `[..., r_y, c_y]`. - -The output tensor is 3-D or higher with shape `[..., r_o, c_o]`, where: - - r_o = c_x if adj_x else r_x - c_o = r_y if adj_y else c_y - -It is computed as: - - output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) - -##### Args: - - -* `x`: A `Tensor`. Must be one of the following types: `half`, `float32`, `float64`, `int32`, `complex64`, `complex128`. - 3-D or higher with shape `[..., r_x, c_x]`. -* `y`: A `Tensor`. Must have the same type as `x`. - 3-D or higher with shape `[..., r_y, c_y]`. -* `adj_x`: An optional `bool`. Defaults to `False`. - If `True`, adjoint the slices of `x`. Defaults to `False`. -* `adj_y`: An optional `bool`. Defaults to `False`. - If `True`, adjoint the slices of `y`. Defaults to `False`. -* `name`: A name for the operation (optional). - -##### Returns: - - A `Tensor`. Has the same type as `x`. 
- 3-D or higher with shape `[..., r_o, c_o]` - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.legacy_seq2seq.sequence_loss_by_example.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.legacy_seq2seq.sequence_loss_by_example.md new file mode 100644 index 00000000000000..a7b6c99c9a9f46 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.legacy_seq2seq.sequence_loss_by_example.md @@ -0,0 +1,25 @@ +### `tf.contrib.legacy_seq2seq.sequence_loss_by_example(logits, targets, weights, average_across_timesteps=True, softmax_loss_function=None, name=None)` {#sequence_loss_by_example} + +Weighted cross-entropy loss for a sequence of logits (per example). + +##### Args: + + +* `logits`: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. +* `targets`: List of 1D batch-sized int32 Tensors of the same length as logits. +* `weights`: List of 1D batch-sized float-Tensors of the same length as logits. +* `average_across_timesteps`: If set, divide the returned cost by the total + label weight. +* `softmax_loss_function`: Function (labels-batch, inputs-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). +* `name`: Optional name for this operation, default: "sequence_loss_by_example". + +##### Returns: + + 1D batch-sized float Tensor: The log-perplexity for each sequence. + +##### Raises: + + +* `ValueError`: If len(logits) is different from len(targets) or len(weights). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.losses.sigmoid_cross_entropy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.losses.sigmoid_cross_entropy.md index 85464c8a7c60fd..242376a5465489 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.losses.sigmoid_cross_entropy.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.losses.sigmoid_cross_entropy.md @@ -1,35 +1,44 @@ ### `tf.contrib.losses.sigmoid_cross_entropy(*args, **kwargs)` {#sigmoid_cross_entropy} -Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. (deprecated arguments) +Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.sigmoid_cross_entropy instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights` - `weight` acts as a coefficient for the loss. If a scalar is provided, - then the loss is simply scaled by the given value. If `weight` is a - tensor of size [`batch_size`], then the loss weights apply to each - corresponding sample. +`weight` acts as a coefficient for the loss. If a scalar is provided, +then the loss is simply scaled by the given value. If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. + +If `label_smoothing` is nonzero, smooth the labels towards 1/2: + + new_multiclass_labels = multiclass_labels * (1 - label_smoothing) + + 0.5 * label_smoothing + +##### Args: + + +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `multi_class_labels`: [batch_size, num_classes] target labels in (0, 1). +* `weights`: Coefficients for the loss. 
The tensor must be a scalar, a tensor of + shape [batch_size] or shape [batch_size, num_classes]. +* `label_smoothing`: If greater than 0 then smooth the labels. +* `scope`: The scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. - If `label_smoothing` is nonzero, smooth the labels towards 1/2: +##### Returns: - new_multiclass_labels = multiclass_labels * (1 - label_smoothing) - + 0.5 * label_smoothing + A scalar `Tensor` representing the loss value. - Args: - logits: [batch_size, num_classes] logits outputs of the network . - multi_class_labels: [batch_size, num_classes] target labels in (0, 1). - weights: Coefficients for the loss. The tensor must be a scalar, a tensor of - shape [batch_size] or shape [batch_size, num_classes]. - label_smoothing: If greater than 0 then smooth the labels. - scope: The scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. +##### Raises: - Returns: - A scalar `Tensor` representing the loss value. - Raises: - ValueError: If the shape of `logits` doesn't match that of - `multi_class_labels` or if the shape of `weight` is invalid, or if - `weight` is None. +* `ValueError`: If the shape of `logits` doesn't match that of + `multi_class_labels` or if the shape of `weight` is invalid, or if + `weight` is None. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.rnn_cell.EmbeddingWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.EmbeddingWrapper.md similarity index 83% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.rnn_cell.EmbeddingWrapper.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.EmbeddingWrapper.md index cb86e3714d80a1..4e1b78cd8b479d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.rnn_cell.EmbeddingWrapper.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.EmbeddingWrapper.md @@ -6,14 +6,14 @@ do the embedding on this batch-concatenated sequence, then split it and feed into your RNN. - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.__call__(inputs, state, scope=None)` {#EmbeddingWrapper.__call__} +#### `tf.contrib.rnn.EmbeddingWrapper.__call__(inputs, state, scope=None)` {#EmbeddingWrapper.__call__} Run the cell on embedded inputs. - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.__init__(cell, embedding_classes, embedding_size, initializer=None)` {#EmbeddingWrapper.__init__} +#### `tf.contrib.rnn.EmbeddingWrapper.__init__(cell, embedding_classes, embedding_size, initializer=None)` {#EmbeddingWrapper.__init__} Create a cell with an added input embedding. @@ -35,21 +35,21 @@ Create a cell with an added input embedding. - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.output_size` {#EmbeddingWrapper.output_size} +#### `tf.contrib.rnn.EmbeddingWrapper.output_size` {#EmbeddingWrapper.output_size} - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.state_size` {#EmbeddingWrapper.state_size} +#### `tf.contrib.rnn.EmbeddingWrapper.state_size` {#EmbeddingWrapper.state_size} - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.zero_state(batch_size, dtype)` {#EmbeddingWrapper.zero_state} +#### `tf.contrib.rnn.EmbeddingWrapper.zero_state(batch_size, dtype)` {#EmbeddingWrapper.zero_state} Return zero-filled state tensor(s). 
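The rename above moves the cell wrappers from `tf.nn.rnn_cell` to `tf.contrib.rnn`. A short sketch of constructing `EmbeddingWrapper` under its new path, using the `__init__` signature shown in this page (vocabulary size, embedding size, and batch size are placeholders):

```python
import tensorflow as tf

vocab_size = 1000     # placeholder vocabulary size
embedding_dim = 32    # placeholder embedding size
batch_size = 16

# BasicLSTMCell also lives under tf.contrib.rnn now (see the
# batch_sequences_with_states hunk earlier in this change).
cell = tf.contrib.rnn.BasicLSTMCell(num_units=64)
wrapped = tf.contrib.rnn.EmbeddingWrapper(cell,
                                          embedding_classes=vocab_size,
                                          embedding_size=embedding_dim)

# One integer symbol id per batch member for a single step.
inputs = tf.placeholder(tf.int32, shape=[batch_size, 1])
state = wrapped.zero_state(batch_size, tf.float32)
output, state = wrapped(inputs, state)
```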
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.RNNCell.md similarity index 91% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.RNNCell.md index f5f4a730cfc8f6..55a16106816a72 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.rnn.RNNCell.md @@ -21,7 +21,7 @@ or by calling the `rnn` ops several times. Every `RNNCell` must have the properties below and and implement `__call__` with the following signature. - - - -#### `tf.nn.rnn_cell.RNNCell.__call__(inputs, state, scope=None)` {#RNNCell.__call__} +#### `tf.contrib.rnn.RNNCell.__call__(inputs, state, scope=None)` {#RNNCell.__call__} Run this RNN cell on inputs, starting from the given state. @@ -46,14 +46,14 @@ Run this RNN cell on inputs, starting from the given state. - - - -#### `tf.nn.rnn_cell.RNNCell.output_size` {#RNNCell.output_size} +#### `tf.contrib.rnn.RNNCell.output_size` {#RNNCell.output_size} Integer or TensorShape: size of outputs produced by this cell. - - - -#### `tf.nn.rnn_cell.RNNCell.state_size` {#RNNCell.state_size} +#### `tf.contrib.rnn.RNNCell.state_size` {#RNNCell.state_size} size(s) of state(s) used by this cell. @@ -63,7 +63,7 @@ or TensorShapes. - - - -#### `tf.nn.rnn_cell.RNNCell.zero_state(batch_size, dtype)` {#RNNCell.zero_state} +#### `tf.contrib.rnn.RNNCell.zero_state(batch_size, dtype)` {#RNNCell.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.histogram_summary.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.histogram_summary.md deleted file mode 100644 index 1ede11e8203f87..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.histogram_summary.md +++ /dev/null @@ -1,25 +0,0 @@ -### `tf.histogram_summary(tag, values, collections=None, name=None)` {#histogram_summary} - -Outputs a `Summary` protocol buffer with a histogram. - -The generated -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -has one summary value containing a histogram for `values`. - -This op reports an `InvalidArgument` error if any value is not finite. - -##### Args: - - -* `tag`: A `string` `Tensor`. 0-D. Tag to use for the summary value. -* `values`: A real numeric `Tensor`. Any shape. Values to use to - build the histogram. -* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.merge_summary.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.merge_summary.md deleted file mode 100644 index b61a501c2d6cfc..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.merge_summary.md +++ /dev/null @@ -1,26 +0,0 @@ -### `tf.merge_summary(inputs, collections=None, name=None)` {#merge_summary} - -Merges summaries. 
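The deleted `tf.histogram_summary` page above, and the `tf.merge_summary` page whose deletion continues below, track the move of summary ops into the `tf.summary` module that the `GraphKeys` and `SummaryWriter` hunks later in this diff point to. A minimal sketch of the new spelling, assuming `tf.summary.histogram` and `tf.summary.merge` are the direct replacements:

```python
import tensorflow as tf

activations = tf.random_normal([32, 128])  # stand-in tensor to summarize

# Removed spelling:   hist = tf.histogram_summary("activations", activations)
hist = tf.summary.histogram("activations", activations)

# Removed spelling:   merged = tf.merge_summary([hist])
merged = tf.summary.merge([hist])
```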
- -This op creates a -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -protocol buffer that contains the union of all the values in the input -summaries. - -When the Op is run, it reports an `InvalidArgument` error if multiple values -in the summaries to merge use the same tag. - -##### Args: - - -* `inputs`: A list of `string` `Tensor` objects containing serialized `Summary` - protocol buffers. -* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer resulting from the merging. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.atrous_conv2d_transpose.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.atrous_conv2d_transpose.md new file mode 100644 index 00000000000000..a4caa4625842ba --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.atrous_conv2d_transpose.md @@ -0,0 +1,43 @@ +### `tf.nn.atrous_conv2d_transpose(value, filters, output_shape, rate, padding, name=None)` {#atrous_conv2d_transpose} + +The transpose of `atrous_conv2d`. + +This operation is sometimes called "deconvolution" after [Deconvolutional +Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is +actually the transpose (gradient) of `atrous_conv2d` rather than an actual +deconvolution. + +##### Args: + + +* `value`: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC` + format. Its shape is `[batch, in_height, in_width, in_channels]`. +* `filters`: A 4-D `Tensor` with the same type as `value` and shape + `[filter_height, filter_width, out_channels, in_channels]`. `filters`' + `in_channels` dimension must match that of `value`. Atrous convolution is + equivalent to standard convolution with upsampled filters with effective + height `filter_height + (filter_height - 1) * (rate - 1)` and effective + width `filter_width + (filter_width - 1) * (rate - 1)`, produced by + inserting `rate - 1` zeros along consecutive elements across the + `filters`' spatial dimensions. +* `output_shape`: A 1-D `Tensor` of shape representing the output shape of the + deconvolution op. +* `rate`: A positive int32. The stride with which we sample input values across + the `height` and `width` dimensions. Equivalently, the rate by which we + upsample the filter values by inserting zeros across the `height` and + `width` dimensions. In the literature, the same parameter is sometimes + called `input stride` or `dilation`. +* `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. +* `name`: Optional name for the returned tensor. + +##### Returns: + + A `Tensor` with the same type as `value`. + +##### Raises: + + +* `ValueError`: If input/output depth does not match `filters`' shape, or if + padding is other than `'VALID'` or `'SAME'`, or if the `rate` is less + than one, or if the output_shape is not a tensor with 4 elements. 
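A short usage sketch for the new `tf.nn.atrous_conv2d_transpose` page above, with made-up shapes chosen so that `SAME` padding keeps the spatial size (per the Args list, `filters` is laid out as `[height, width, out_channels, in_channels]`):

```python
import tensorflow as tf

value = tf.random_normal([1, 64, 64, 8])    # NHWC input with 8 channels
filters = tf.random_normal([3, 3, 16, 8])   # [height, width, out_channels, in_channels]
output_shape = [1, 64, 64, 16]              # SAME padding preserves 64x64

y = tf.nn.atrous_conv2d_transpose(value, filters, output_shape,
                                  rate=2, padding='SAME')

with tf.Session() as sess:
    print(sess.run(tf.shape(y)))  # [ 1 64 64 16]
```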
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.maybe_shuffle_batch.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.maybe_shuffle_batch.md new file mode 100644 index 00000000000000..d85bded6c8342c --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.maybe_shuffle_batch.md @@ -0,0 +1,39 @@ +### `tf.train.maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, keep_input, num_threads=1, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_shuffle_batch} + +Creates batches by randomly shuffling conditionally-enqueued tensors. + +See docstring in `shuffle_batch` for more details. + +##### Args: + + +* `tensors`: The list or dictionary of tensors to enqueue. +* `batch_size`: The new batch size pulled from the queue. +* `capacity`: An integer. The maximum number of elements in the queue. +* `min_after_dequeue`: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `num_threads`: The number of threads enqueuing `tensor_list`. +* `seed`: Seed for the random shuffling within the queue. +* `enqueue_many`: Whether each tensor in `tensor_list` is a single example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list`. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (Optional) If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the types as `tensors`. + +##### Raises: + + +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.concat.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.concat.md index dd300b968bfa32..404a51830d934e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.concat.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.concat.md @@ -1,6 +1,10 @@ -### `tf.concat(concat_dim, values, name='concat')` {#concat} +### `tf.concat(*args, **kwargs)` {#concat} -Concatenates tensors along one dimension. +Concatenates tensors along one dimension. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-13. +Instructions for updating: +This op will be removed after the deprecation date. Please switch to tf.concat_v2(). Concatenates the list of tensors `values` along dimension `concat_dim`. If `values[i].shape = [D0, D1, ... 
Dconcat_dim(i), ...Dn]`, the concatenated diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.losses.log_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.losses.log_loss.md index 631d05b43efe49..b97a4c5597b92f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.losses.log_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.losses.log_loss.md @@ -1,33 +1,42 @@ ### `tf.contrib.losses.log_loss(*args, **kwargs)` {#log_loss} -Adds a Log Loss term to the training procedure. (deprecated arguments) +Adds a Log Loss term to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.log_loss instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. `weight` is being deprecated, use `weights`. - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. If the shape of - `weight` matches the shape of `predictions`, then the loss of each - measurable element of `predictions` is scaled by the corresponding value of - `weight`. - - Args: - predictions: The predicted outputs. - labels: The ground truth output tensor, same dimensions as 'predictions'. - weights: Coefficients for the loss a scalar, a tensor of shape - [batch_size] or a tensor whose shape matches `predictions`. - epsilon: A small increment to add to avoid taking a log of zero. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. If the shape of +`weight` matches the shape of `predictions`, then the loss of each +measurable element of `predictions` is scaled by the corresponding value of +`weight`. + +##### Args: + + +* `predictions`: The predicted outputs. +* `labels`: The ground truth output tensor, same dimensions as 'predictions'. +* `weights`: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. +* `epsilon`: A small increment to add to avoid taking a log of zero. +* `scope`: The scope for the operations performed in computing the loss. +* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. 
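The notice above says to use `tf.losses.log_loss` instead. A hedged migration sketch, passing keyword arguments on the assumption that the core version keeps the `labels`, `predictions`, and `weights` parameter names from the Args list above:

```python
import tensorflow as tf

labels = tf.constant([[1.0, 0.0], [0.0, 1.0]])
predictions = tf.constant([[0.9, 0.2], [0.1, 0.8]])

# Before (deprecated):
#   loss = tf.contrib.losses.log_loss(predictions, labels, weights=2.0)
# After, with keyword arguments so positional order does not matter:
loss = tf.losses.log_loss(labels=labels, predictions=predictions, weights=2.0)
```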
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.LSTMStateTuple.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.rnn.LSTMStateTuple.md similarity index 55% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.LSTMStateTuple.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.rnn.LSTMStateTuple.md index e8238077bfe3dd..7db1e1277e6d29 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.rnn_cell.LSTMStateTuple.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.rnn.LSTMStateTuple.md @@ -5,49 +5,49 @@ Stores two elements: `(c, h)`, in that order. Only used when `state_is_tuple=True`. - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__getnewargs__()` {#LSTMStateTuple.__getnewargs__} +#### `tf.contrib.rnn.LSTMStateTuple.__getnewargs__()` {#LSTMStateTuple.__getnewargs__} Return self as a plain tuple. Used by copy and pickle. - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__getstate__()` {#LSTMStateTuple.__getstate__} +#### `tf.contrib.rnn.LSTMStateTuple.__getstate__()` {#LSTMStateTuple.__getstate__} Exclude the OrderedDict from pickling - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} +#### `tf.contrib.rnn.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} Create new instance of LSTMStateTuple(c, h) - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__repr__()` {#LSTMStateTuple.__repr__} +#### `tf.contrib.rnn.LSTMStateTuple.__repr__()` {#LSTMStateTuple.__repr__} Return a nicely formatted representation string - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.c` {#LSTMStateTuple.c} +#### `tf.contrib.rnn.LSTMStateTuple.c` {#LSTMStateTuple.c} Alias for field number 0 - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.dtype` {#LSTMStateTuple.dtype} +#### `tf.contrib.rnn.LSTMStateTuple.dtype` {#LSTMStateTuple.dtype} - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.h` {#LSTMStateTuple.h} +#### `tf.contrib.rnn.LSTMStateTuple.h` {#LSTMStateTuple.h} Alias for field number 1 diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md index 180a592b8c2ee2..a868d4818e167a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md @@ -7,18 +7,27 @@ protos given in `serialized`. `example_names` may contain descriptive names for the corresponding serialized protos. These may be useful for debugging purposes, but they have no effect on -the output. If not `None`, `example_names` must be the same length as `serialized`. +the output. If not `None`, `example_names` must be the same length as +`serialized`. This op parses serialized examples into a dictionary mapping keys to `Tensor` -and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature` -and `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a -`SparseTensor`, and each `FixedLenFeature` is mapped to a `Tensor`. +and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`, +`SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature` +and `SparseFeature` is mapped to a `SparseTensor`, and each +`FixedLenFeature` is mapped to a `Tensor`. Each `VarLenFeature` maps to a `SparseTensor` of the specified type representing a ragged matrix. 
Its indices are `[batch, index]` where `batch` is the batch entry the value is from in `serialized`, and `index` is the value's index in the list of values associated with that feature and example. +Each `SparseFeature` maps to a `SparseTensor` of the specified type +representing a sparse matrix of shape +`(serialized.size(), SparseFeature.size)`. Its indices are `[batch, index]` +where `batch` is the batch entry the value is from in `serialized`, and +`index` is the value's index is given by the values in the +`SparseFeature.index_key` feature column. + Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. @@ -47,7 +56,7 @@ then the output will look like: ``` {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], values=[1.0, 2.0, 3.0], - shape=(3, 2)) } + dense_shape=(3, 2)) } ``` Given two `Example` input protos in `serialized`: @@ -130,13 +139,48 @@ And the expected output is: } ``` +Given two `Example` input protos in `serialized`: + +``` +[ + features { + feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } + feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } + }, + features { + feature { key: "val" value { float_list { value: [ 0.0 ] } } } + feature { key: "ix" value { int64_list { value: [ 42 ] } } } + } +] +``` + +And arguments + +``` +example_names: ["input0", "input1"], +features: { + "sparse": SparseFeature("ix", "val", tf.float32, 100), +} +``` + +Then the output is a dictionary: + +```python +{ + "sparse": SparseTensor( + indices=[[0, 3], [0, 20], [1, 42]], + values=[0.5, -1.0, 0.0] + shape=[2, 100]), +} +``` + ##### Args: * `serialized`: A vector (1-D Tensor) of strings, a batch of binary serialized `Example` protos. -* `features`: A `dict` mapping feature keys to `FixedLenFeature` or - `VarLenFeature` values. +* `features`: A `dict` mapping feature keys to `FixedLenFeature`, + `VarLenFeature`, and `SparseFeature` values. * `name`: A name for this operation (optional). * `example_names`: A vector (1-D Tensor) of strings (optional), the names of the serialized protos in the batch. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.legacy_seq2seq.model_with_buckets.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.legacy_seq2seq.model_with_buckets.md new file mode 100644 index 00000000000000..37e2b9a076cd37 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.legacy_seq2seq.model_with_buckets.md @@ -0,0 +1,42 @@ +### `tf.contrib.legacy_seq2seq.model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, buckets, seq2seq, softmax_loss_function=None, per_example_loss=False, name=None)` {#model_with_buckets} + +Create a sequence-to-sequence model with support for bucketing. + +The seq2seq argument is a function that defines a sequence-to-sequence model, +e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) + +##### Args: + + +* `encoder_inputs`: A list of Tensors to feed the encoder; first seq2seq input. +* `decoder_inputs`: A list of Tensors to feed the decoder; second seq2seq input. +* `targets`: A list of 1D batch-sized int32 Tensors (desired output sequence). +* `weights`: List of 1D batch-sized float-Tensors to weight the targets. +* `buckets`: A list of pairs of (input size, output size) for each bucket. 
+* `seq2seq`: A sequence-to-sequence model function; it takes 2 inputs that
+  agree with encoder_inputs and decoder_inputs, and returns a pair
+  consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
+* `softmax_loss_function`: Function (inputs-batch, labels-batch) -> loss-batch
+  to be used instead of the standard softmax (the default if this is None).
+* `per_example_loss`: Boolean. If set, the returned loss will be a batch-sized
+  tensor of losses for each sequence in the batch. If unset, it will be
+  a scalar with the averaged loss from all examples.
+* `name`: Optional name for this operation, defaults to "model_with_buckets".
+
+##### Returns:
+
+  A tuple of the form (outputs, losses), where:
+
+* `outputs`: The outputs for each bucket. Its j'th element consists of a list
+  of 2D Tensors. The shape of output tensors can be either
+  [batch_size x output_size] or [batch_size x num_decoder_symbols]
+  depending on the seq2seq model used.
+* `losses`: List of scalar Tensors, representing losses for each bucket, or,
+  if per_example_loss is set, a list of 1D batch-sized float Tensors.
+
+##### Raises:
+
+
+* `ValueError`: If length of encoder_inputs, targets, or weights is smaller
+  than the largest (last) bucket.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.losses.softmax_cross_entropy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.losses.softmax_cross_entropy.md
index b13daf9ed741f4..1d2b961e444591 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.losses.softmax_cross_entropy.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.losses.softmax_cross_entropy.md
@@ -1,33 +1,42 @@
 ### `tf.contrib.losses.softmax_cross_entropy(*args, **kwargs)` {#softmax_cross_entropy}
 
-Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. (deprecated arguments)
+Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. (deprecated arguments) (deprecated)
+
+THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30.
+Instructions for updating:
+Use tf.losses.softmax_cross_entropy instead.
 
 SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25.
 Instructions for updating:
 `weight` is being deprecated, use `weights`
 
- `weight` acts as a coefficient for the loss. If a scalar is provided,
- then the loss is simply scaled by the given value.
-
- If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
- new_onehot_labels = onehot_labels * (1 - label_smoothing)
- + label_smoothing / num_classes
-
- Args:
- logits: [batch_size, num_classes] logits outputs of the network .
- onehot_labels: [batch_size, num_classes] target one_hot_encoded labels.
- weights: Coefficients for the loss. The tensor must be a scalar or a tensor
- of shape [batch_size].
- label_smoothing: If greater than 0 then smooth the labels.
- scope: the scope for the operations performed in computing the loss.
- weight: Deprecated alias for `weights`.
-
- Returns:
- A scalar `Tensor` representing the loss value.
-
- Raises:
- ValueError: If the shape of `logits` doesn't match that of `onehot_labels`
- or if the shape of `weight` is invalid or if `weight` is None.
+`weight` acts as a coefficient for the loss. If a scalar is provided,
+then the loss is simply scaled by the given value.
If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. + +If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes: + new_onehot_labels = onehot_labels * (1 - label_smoothing) + + label_smoothing / num_classes + +##### Args: + + +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `onehot_labels`: [batch_size, num_classes] target one_hot_encoded labels. +* `weights`: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size]. +* `label_smoothing`: If greater than 0 then smooth the labels. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `logits` doesn't match that of `onehot_labels` + or if the shape of `weight` is invalid or if `weight` is None. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.BasicRNNCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.BasicRNNCell.md similarity index 71% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.BasicRNNCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.BasicRNNCell.md index a2aed04e463ed8..9f13497f4763d1 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.BasicRNNCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.BasicRNNCell.md @@ -1,35 +1,35 @@ The most basic RNN cell. - - - -#### `tf.nn.rnn_cell.BasicRNNCell.__call__(inputs, state, scope=None)` {#BasicRNNCell.__call__} +#### `tf.contrib.rnn.BasicRNNCell.__call__(inputs, state, scope=None)` {#BasicRNNCell.__call__} Most basic RNN: output = new_state = act(W * input + U * state + B). - - - -#### `tf.nn.rnn_cell.BasicRNNCell.__init__(num_units, input_size=None, activation=tanh)` {#BasicRNNCell.__init__} +#### `tf.contrib.rnn.BasicRNNCell.__init__(num_units, input_size=None, activation=tanh)` {#BasicRNNCell.__init__} - - - -#### `tf.nn.rnn_cell.BasicRNNCell.output_size` {#BasicRNNCell.output_size} +#### `tf.contrib.rnn.BasicRNNCell.output_size` {#BasicRNNCell.output_size} - - - -#### `tf.nn.rnn_cell.BasicRNNCell.state_size` {#BasicRNNCell.state_size} +#### `tf.contrib.rnn.BasicRNNCell.state_size` {#BasicRNNCell.state_size} - - - -#### `tf.nn.rnn_cell.BasicRNNCell.zero_state(batch_size, dtype)` {#BasicRNNCell.zero_state} +#### `tf.contrib.rnn.BasicRNNCell.zero_state(batch_size, dtype)` {#BasicRNNCell.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.LSTMCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.LSTMCell.md similarity index 92% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.LSTMCell.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.LSTMCell.md index 5ee8c2ad30b241..0d380d1e2e281d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.nn.rnn_cell.LSTMCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.rnn.LSTMCell.md @@ -19,7 +19,7 @@ The class uses optional peep-hole connections, optional cell clipping, and an optional projection layer. 
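Continuing the `tf.nn.rnn_cell` to `tf.contrib.rnn` renames, a small sketch of building the LSTM cell documented above under its new path; the sizes are placeholders, and peepholes plus the projection layer are the optional features mentioned in the intro:

```python
import tensorflow as tf

batch_size, input_dim = 16, 32
inputs = tf.placeholder(tf.float32, [batch_size, input_dim])

cell = tf.contrib.rnn.LSTMCell(num_units=64,
                               use_peepholes=True,  # optional peep-hole connections
                               num_proj=48)         # optional projection layer
state = cell.zero_state(batch_size, tf.float32)
output, state = cell(inputs, state)  # one step; output size is num_proj
```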
- - - -#### `tf.nn.rnn_cell.LSTMCell.__call__(inputs, state, scope=None)` {#LSTMCell.__call__} +#### `tf.contrib.rnn.LSTMCell.__call__(inputs, state, scope=None)` {#LSTMCell.__call__} Run one step of LSTM. @@ -54,7 +54,7 @@ Run one step of LSTM. - - - -#### `tf.nn.rnn_cell.LSTMCell.__init__(num_units, input_size=None, use_peepholes=False, cell_clip=None, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=None, num_proj_shards=None, forget_bias=1.0, state_is_tuple=True, activation=tanh)` {#LSTMCell.__init__} +#### `tf.contrib.rnn.LSTMCell.__init__(num_units, input_size=None, use_peepholes=False, cell_clip=None, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=None, num_proj_shards=None, forget_bias=1.0, state_is_tuple=True, activation=tanh)` {#LSTMCell.__init__} Initialize the parameters for an LSTM cell. @@ -88,21 +88,21 @@ Initialize the parameters for an LSTM cell. - - - -#### `tf.nn.rnn_cell.LSTMCell.output_size` {#LSTMCell.output_size} +#### `tf.contrib.rnn.LSTMCell.output_size` {#LSTMCell.output_size} - - - -#### `tf.nn.rnn_cell.LSTMCell.state_size` {#LSTMCell.state_size} +#### `tf.contrib.rnn.LSTMCell.state_size` {#LSTMCell.state_size} - - - -#### `tf.nn.rnn_cell.LSTMCell.zero_state(batch_size, dtype)` {#LSTMCell.zero_state} +#### `tf.contrib.rnn.LSTMCell.zero_state(batch_size, dtype)` {#LSTMCell.zero_state} Return zero-filled state tensor(s). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.sparse_tensor_dense_matmul.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.sparse_tensor_dense_matmul.md index 7b75c6010979e3..02b4927c286305 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.sparse_tensor_dense_matmul.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.sparse_tensor_dense_matmul.md @@ -25,8 +25,8 @@ converting the `SparseTensor` to a dense one and using `tf.matmul` with `sp_a=True`. This operation tends to perform well when A is more sparse, if the column size -of the product is small (e.g. matrix-vector multiplication), if sp_a.shape -takes on large values. +of the product is small (e.g. matrix-vector multiplication), if +`sp_a.dense_shape` takes on large values. Below is a rough speed comparison between sparse_tensor_dense_matmul, labelled 'sparse', and matmul(sp_a=True), labelled 'dense'. For purposes of diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md deleted file mode 100644 index e9bdda200f9cc2..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md +++ /dev/null @@ -1,207 +0,0 @@ - -- - - - -#### `tf.train.SummaryWriter.__init__(*args, **kwargs)` {#SummaryWriter.__init__} - -Creates a `SummaryWriter` and an event file. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-30. -Instructions for updating: -Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename. - - This class is deprecated, and should be replaced with tf.summary.FileWriter. - - On construction the summary writer creates a new event file in `logdir`. - This event file will contain `Event` protocol buffers constructed when you - call one of the following functions: `add_summary()`, `add_session_log()`, - `add_event()`, or `add_graph()`. - - If you pass a `Graph` to the constructor it is added to - the event file. 
(This is equivalent to calling `add_graph()` later). - - TensorBoard will pick the graph from the file and display it graphically so - you can interactively explore the graph you built. You will usually pass - the graph from the session in which you launched it: - - ```python - ...create a graph... - # Launch the graph in a session. - sess = tf.Session() - # Create a summary writer, add the 'graph' to the event file. - writer = tf.train.SummaryWriter(, sess.graph) - ``` - - The other arguments to the constructor control the asynchronous writes to - the event file: - - * `flush_secs`: How often, in seconds, to flush the added summaries - and events to disk. - * `max_queue`: Maximum number of summaries or events pending to be - written to disk before one of the 'add' calls block. - - Args: - logdir: A string. Directory where event file will be written. - graph: A `Graph` object, such as `sess.graph`. - max_queue: Integer. Size of the queue for pending events and summaries. - flush_secs: Number. How often, in seconds, to flush the - pending events and summaries to disk. - graph_def: DEPRECATED: Use the `graph` argument instead. - - -- - - - -#### `tf.train.SummaryWriter.add_event(event)` {#SummaryWriter.add_event} - -Adds an event to the event file. - -##### Args: - - -* `event`: An `Event` protocol buffer. - - -- - - - -#### `tf.train.SummaryWriter.add_graph(graph, global_step=None, graph_def=None)` {#SummaryWriter.add_graph} - -Adds a `Graph` to the event file. - -The graph described by the protocol buffer will be displayed by -TensorBoard. Most users pass a graph in the constructor instead. - -##### Args: - - -* `graph`: A `Graph` object, such as `sess.graph`. -* `global_step`: Number. Optional global step counter to record with the - graph. -* `graph_def`: DEPRECATED. Use the `graph` parameter instead. - -##### Raises: - - -* `ValueError`: If both graph and graph_def are passed to the method. - - -- - - - -#### `tf.train.SummaryWriter.add_meta_graph(meta_graph_def, global_step=None)` {#SummaryWriter.add_meta_graph} - -Adds a `MetaGraphDef` to the event file. - -The `MetaGraphDef` allows running the given graph via -`saver.import_meta_graph()`. - -##### Args: - - -* `meta_graph_def`: A `MetaGraphDef` object, often as retured by - `saver.export_meta_graph()`. -* `global_step`: Number. Optional global step counter to record with the - graph. - -##### Raises: - - -* `TypeError`: If both `meta_graph_def` is not an instance of `MetaGraphDef`. - - -- - - - -#### `tf.train.SummaryWriter.add_run_metadata(run_metadata, tag, global_step=None)` {#SummaryWriter.add_run_metadata} - -Adds a metadata information for a single session.run() call. - -##### Args: - - -* `run_metadata`: A `RunMetadata` protobuf object. -* `tag`: The tag name for this metadata. -* `global_step`: Number. Optional global step counter to record with the - StepStats. - -##### Raises: - - -* `ValueError`: If the provided tag was already used for this type of event. - - -- - - - -#### `tf.train.SummaryWriter.add_session_log(session_log, global_step=None)` {#SummaryWriter.add_session_log} - -Adds a `SessionLog` protocol buffer to the event file. - -This method wraps the provided session in an `Event` protocol buffer -and adds it to the event file. - -##### Args: - - -* `session_log`: A `SessionLog` protocol buffer. -* `global_step`: Number. Optional global step value to record with the - summary. 
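The deleted `tf.train.SummaryWriter` page above notes that the class is renamed to `tf.summary.FileWriter` with the same interface. A short sketch of the renamed workflow; the log directory and the summarized value are placeholders, and `tf.summary.scalar` plus `tf.summary.merge_all` are assumed to be available alongside it (the `GraphKeys` hunk below references `tf.summary.merge_all`):

```python
import tensorflow as tf

loss = tf.constant(0.25)          # placeholder value to log
tf.summary.scalar("loss", loss)
merged = tf.summary.merge_all()

with tf.Session() as sess:
    # Same constructor arguments as the old SummaryWriter: logdir, then graph.
    writer = tf.summary.FileWriter("/tmp/logs", sess.graph)
    summary = sess.run(merged)
    writer.add_summary(summary, global_step=0)
    writer.close()
```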
- - -- - - - -#### `tf.train.SummaryWriter.add_summary(summary, global_step=None)` {#SummaryWriter.add_summary} - -Adds a `Summary` protocol buffer to the event file. - -This method wraps the provided summary in an `Event` protocol buffer -and adds it to the event file. - -You can pass the result of evaluating any summary op, using -[`Session.run()`](client.md#Session.run) or -[`Tensor.eval()`](framework.md#Tensor.eval), to this -function. Alternatively, you can pass a `tf.Summary` protocol -buffer that you populate with your own data. The latter is -commonly done to report evaluation results in event files. - -##### Args: - - -* `summary`: A `Summary` protocol buffer, optionally serialized as a string. -* `global_step`: Number. Optional global step value to record with the - summary. - - -- - - - -#### `tf.train.SummaryWriter.close()` {#SummaryWriter.close} - -Flushes the event file to disk and close the file. - -Call this method when you do not need the summary writer anymore. - - -- - - - -#### `tf.train.SummaryWriter.flush()` {#SummaryWriter.flush} - -Flushes the event file to disk. - -Call this method to make sure that all pending events have been written to -disk. - - -- - - - -#### `tf.train.SummaryWriter.get_logdir()` {#SummaryWriter.get_logdir} - -Returns the directory where event file will be written. - - -- - - - -#### `tf.train.SummaryWriter.reopen()` {#SummaryWriter.reopen} - -Reopens the EventFileWriter. - -Can be called after `close()` to add more events in the same directory. -The events will go into a new events file. - -Does nothing if the EventFileWriter was not closed. - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md index 7ec18e834c7afa..ff4f8f8f58ce88 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md @@ -27,7 +27,7 @@ The following standard keys are defined: for more details. * `SUMMARIES`: the summary `Tensor` objects that have been created in the graph. See - [`tf.contrib.deprecated.merge_all_summaries()`](../../api_docs/python/train.md#merge_all_summaries) + [`tf.summary.merge_all()`](../../api_docs/python/summary.md#merge_all) for more details. * `QUEUE_RUNNERS`: the `QueueRunner` objects that are used to produce input for a computation. See diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.argmax.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.argmax.md index 8e92fb59e8b79a..44a278e0d49e27 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.argmax.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.argmax.md @@ -1,6 +1,6 @@ ### `tf.argmax(input, axis=None, name=None, dimension=None)` {#argmax} -Returns the index with the largest value across axiss of a tensor. +Returns the index with the largest value across axes of a tensor. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md new file mode 100644 index 00000000000000..1351773a3c24b7 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md @@ -0,0 +1,480 @@ +`LinearOperator` acting like a [batch] square lower triangular matrix. 
+ +This operator acts like a [batch] matrix `A` with shape +`[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a +batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is +an `N x N` matrix. + +`LinearOperatorTriL` is initialized with a `Tensor` having dimensions +`[B1,...,Bb, N, N]`. The upper triangle of the last two dimensions is ignored. + +```python +# Create a 2 x 2 lower-triangular linear operator. +tril = [[1., 2.], [3., 4.]] +operator = LinearOperatorTriL(tril) + +# The upper triangle is ignored. +operator.to_dense() +==> [[1., 0.] + [3., 4.]] + +operator.shape +==> [2, 2] + +operator.log_determinant() +==> scalar Tensor + +x = ... Shape [2, 4] Tensor +operator.apply(x) +==> Shape [2, 4] Tensor + +# Create a [2, 3] batch of 4 x 4 linear operators. +tril = tf.random_normal(shape=[2, 3, 4, 4]) +operator = LinearOperatorTriL(tril) + +# Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible +# since the batch dimensions, [2, 1], are brodcast to +# operator.batch_shape = [2, 3]. +y = tf.random_normal(shape=[2, 1, 4, 2]) +x = operator.solve(y) +==> operator.apply(x) = y +``` + +### Shape compatibility + +This operator acts on [batch] matrix with compatible shape. +`x` is a batch matrix with compatible shape for `apply` and `solve` if + +``` +operator.shape = [B1,...,Bb] + [N, N], with b >= 0 +x.shape = [B1,...,Bb] + [N, R], with R >= 0. +``` + +### Performance + +Suppose `operator` is a `LinearOperatorTriL` of shape `[N, N]`, +and `x.shape = [N, R]`. Then + +* `operator.apply(x)` involves `N^2 * R` multiplications. +* `operator.solve(x)` involves `N * R` size `N` back-substitutions. +* `operator.determinant()` involves a size `N` `reduce_prod`. + +If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and +`[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`. + +### Matrix property hints + +This `LinearOperator` is initialized with boolean flags of the form `is_X`, +for `X = non_singular, self_adjoint` etc... +These have the following meaning +* If `is_X == True`, callers should expect the operator to have the + property `X`. This is a promise that should be fulfilled, but is *not* a + runtime assert. For example, finite floating point precision may result + in these promises being violated. +* If `is_X == False`, callers should expect the operator to not have `X`. +* If `is_X == None` (the default), callers should have no expectation either + way. +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.__init__(tril, is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, name='LinearOperatorTriL')` {#LinearOperatorTriL.__init__} + +Initialize a `LinearOperatorTriL`. + +##### Args: + + +* `tril`: Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`. + The lower triangular part of `tril` defines this operator. The strictly + upper triangle is ignored. Allowed dtypes: `float32`, `float64`. +* `is_non_singular`: Expect that this operator is non-singular. + This operator is non-singular if and only if its diagonal elements are + all non-zero. +* `is_self_adjoint`: Expect that this operator is equal to its hermitian + transpose. This operator is self-adjoint only if it is diagonal with + real-valued diagonal entries. In this case it is advised to use + `LinearOperatorDiag`. +* `is_positive_definite`: Expect that this operator is positive definite, + meaning the real part of all eigenvalues is positive. We do not require + the operator to be self-adjoint to be positive-definite. 
See: +* `https`: //en.wikipedia.org/wiki/Positive-definite_matrix + #Extension_for_non_symmetric_matrices +* `name`: A name for this `LinearOperator`. + +##### Raises: + + +* `TypeError`: If `diag.dtype` is not an allowed type. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.add_to_tensor(x, name='add_to_tensor')` {#LinearOperatorTriL.add_to_tensor} + +Add matrix represented by this operator to `x`. Equivalent to `A + x`. + +##### Args: + + +* `x`: `Tensor` with same `dtype` and shape broadcastable to `self.shape`. +* `name`: A name to give this `Op`. + +##### Returns: + + A `Tensor` with broadcast shape and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.apply(x, adjoint=False, name='apply')` {#LinearOperatorTriL.apply} + +Transform `x` with left multiplication: `x --> Ax`. + +##### Args: + + +* `x`: `Tensor` with compatible shape and same `dtype` as `self`. + See class docstring for definition of compatibility. +* `adjoint`: Python `bool`. If `True`, left multiply by the adjoint. +* `name`: A name for this `Op. + +##### Returns: + + A `Tensor` with shape `[..., M, R]` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_non_singular(name='assert_non_singular')` {#LinearOperatorTriL.assert_non_singular} + +Returns an `Op` that asserts this operator is non singular. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_positive_definite(name='assert_positive_definite')` {#LinearOperatorTriL.assert_positive_definite} + +Returns an `Op` that asserts this operator is positive definite. + +Here, positive definite means the real part of all eigenvalues is positive. +We do not require the operator to be self-adjoint. + +##### Args: + + +* `name`: A name to give this `Op`. + +##### Returns: + + An `Op` that asserts this operator is positive definite. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.assert_self_adjoint(name='assert_self_adjoint')` {#LinearOperatorTriL.assert_self_adjoint} + +Returns an `Op` that asserts this operator is self-adjoint. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape` {#LinearOperatorTriL.batch_shape} + +`TensorShape` of batch dimensions of this `LinearOperator`. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns +`TensorShape([B1,...,Bb])`, equivalent to `A.get_shape()[:-2]` + +##### Returns: + + `TensorShape`, statically determined, may be undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorTriL.batch_shape_dynamic} + +Shape of batch dimensions of this operator, determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns a `Tensor` holding +`[B1,...,Bb]`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.determinant(name='det')` {#LinearOperatorTriL.determinant} + +Determinant for every batch member. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `Tensor` with shape `self.batch_shape` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension` {#LinearOperatorTriL.domain_dimension} + +Dimension (in the sense of vector spaces) of the domain of this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `N`. 
+ +##### Returns: + + Python integer if vector space dimension can be determined statically, + otherwise `None`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorTriL.domain_dimension_dynamic} + +Dimension (in the sense of vector spaces) of the domain of this operator. + +Determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `N`. + +##### Args: + + +* `name`: A name for this `Op`. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.dtype` {#LinearOperatorTriL.dtype} + +The `DType` of `Tensor`s handled by this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.graph_parents` {#LinearOperatorTriL.graph_parents} + +List of graph dependencies of this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_non_singular` {#LinearOperatorTriL.is_non_singular} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_positive_definite` {#LinearOperatorTriL.is_positive_definite} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.is_self_adjoint` {#LinearOperatorTriL.is_self_adjoint} + + + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.log_abs_determinant(name='log_abs_det')` {#LinearOperatorTriL.log_abs_determinant} + +Log absolute value of determinant for every batch member. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `Tensor` with shape `self.batch_shape` and same `dtype` as `self`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.name` {#LinearOperatorTriL.name} + +Name prepended to all ops created by this `LinearOperator`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension` {#LinearOperatorTriL.range_dimension} + +Dimension (in the sense of vector spaces) of the range of this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `M`. + +##### Returns: + + Python integer if vector space dimension can be determined statically, + otherwise `None`. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorTriL.range_dimension_dynamic} + +Dimension (in the sense of vector spaces) of the range of this operator. + +Determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `M`. + +##### Args: + + +* `name`: A name for this `Op`. + +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.shape` {#LinearOperatorTriL.shape} + +`TensorShape` of this `LinearOperator`. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns +`TensorShape([B1,...,Bb, M, N])`, equivalent to `A.get_shape()`. + +##### Returns: + + `TensorShape`, statically determined, may be undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.shape_dynamic(name='shape_dynamic')` {#LinearOperatorTriL.shape_dynamic} + +Shape of this `LinearOperator`, determined at runtime. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns a `Tensor` holding +`[B1,...,Bb, M, N]`, equivalent to `tf.shape(A)`. + +##### Args: + + +* `name`: A name for this `Op. 
+ +##### Returns: + + `int32` `Tensor` + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.solve(rhs, adjoint=False, name='solve')` {#LinearOperatorTriL.solve} + +Solve `R` (batch) systems of equations exactly: `A X = rhs`. + +Examples: + +```python +# Create an operator acting like a 10 x 2 x 2 matrix. +operator = LinearOperator(...) +operator.shape # = 10 x 2 x 2 + +# Solve one linear system (R = 1) for every member of the length 10 batch. +RHS = ... # shape 10 x 2 x 1 +X = operator.solve(RHS) # shape 10 x 2 x 1 + +# Solve five linear systems (R = 5) for every member of the length 10 batch. +RHS = ... # shape 10 x 2 x 5 +X = operator.solve(RHS) +X[3, :, 2] # Solution to the linear system A[3, :, :] X = RHS[3, :, 2] +``` + +##### Args: + + +* `rhs`: `Tensor` with same `dtype` as this operator and compatible shape. + See class docstring for definition of compatibility. +* `adjoint`: Python `bool`. If `True`, solve the system involving the adjoint + of this `LinearOperator`. +* `name`: A name scope to use for ops added by this method. + +##### Returns: + + `Tensor` with shape `[...,N, R]` and same `dtype` as `rhs`. + +##### Raises: + + +* `ValueError`: If self.is_non_singular is False. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank` {#LinearOperatorTriL.tensor_rank} + +Rank (in the sense of tensors) of matrix corresponding to this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + Python integer, or None if the tensor rank is undefined. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorTriL.tensor_rank_dynamic} + +Rank (in the sense of tensors) of matrix corresponding to this operator. + +If this operator acts like the batch matrix `A` with +`A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`. + +##### Args: + + +* `name`: A name for this `Op. + +##### Returns: + + `int32` `Tensor`, determined at runtime. + + +- - - + +#### `tf.contrib.linalg.LinearOperatorTriL.to_dense(name='to_dense')` {#LinearOperatorTriL.to_dense} + +Return a dense (batch) matrix representing this operator. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.losses.sparse_softmax_cross_entropy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.losses.sparse_softmax_cross_entropy.md index 0d46ea2083b000..cfe7ca2af105c1 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.losses.sparse_softmax_cross_entropy.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.losses.sparse_softmax_cross_entropy.md @@ -1,29 +1,38 @@ ### `tf.contrib.losses.sparse_softmax_cross_entropy(*args, **kwargs)` {#sparse_softmax_cross_entropy} -Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`. (deprecated arguments) +Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.sparse_softmax_cross_entropy instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `weight` is being deprecated, use `weights` - `weight` acts as a coefficient for the loss. If a scalar is provided, - then the loss is simply scaled by the given value. 
If `weight` is a - tensor of size [`batch_size`], then the loss weights apply to each - corresponding sample. - - Args: - logits: [batch_size, num_classes] logits outputs of the network . - labels: [batch_size, 1] or [batch_size] target labels of dtype `int32` or - `int64` in the range `[0, num_classes)`. - weights: Coefficients for the loss. The tensor must be a scalar or a tensor - of shape [batch_size] or [batch_size, 1]. - scope: the scope for the operations performed in computing the loss. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shapes of logits, labels, and weight are incompatible, or - if `weight` is None. +`weight` acts as a coefficient for the loss. If a scalar is provided, +then the loss is simply scaled by the given value. If `weight` is a +tensor of size [`batch_size`], then the loss weights apply to each +corresponding sample. + +##### Args: + + +* `logits`: [batch_size, num_classes] logits outputs of the network . +* `labels`: [batch_size, 1] or [batch_size] target labels of dtype `int32` or + `int64` in the range `[0, num_classes)`. +* `weights`: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size] or [batch_size, 1]. +* `scope`: the scope for the operations performed in computing the loss. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shapes of logits, labels, and weight are incompatible, or + if `weight` is None. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.confusion_matrix.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.confusion_matrix.md deleted file mode 100644 index da5b22db84045a..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.confusion_matrix.md +++ /dev/null @@ -1,60 +0,0 @@ -### `tf.contrib.metrics.confusion_matrix(predictions, labels, num_classes=None, dtype=tf.int32, name=None, weights=None)` {#confusion_matrix} - -Computes the confusion matrix from predictions and labels. - -Calculate the Confusion Matrix for a pair of prediction and -label 1-D int arrays. - -The matrix rows represent the prediction labels and the columns -represents the real labels. The confusion matrix is always a 2-D array -of shape `[n, n]`, where `n` is the number of valid labels for a given -classification task. Both prediction and labels must be 1-D arrays of -the same shape in order for this function to work. - -If `num_classes` is None, then `num_classes` will be set to the one plus -the maximum value in either predictions or labels. -Class labels are expected to start at 0. E.g., if `num_classes` was -three, then the possible labels would be `[0, 1, 2]`. - -If `weights` is not `None`, then each prediction contributes its -corresponding weight to the total value of the confusion matrix cell. - -For example: - -```python - tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> - [[0 0 0 0 0] - [0 0 1 0 0] - [0 0 1 0 0] - [0 0 0 0 0] - [0 0 0 0 1]] -``` - -Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, -resulting in a 5x5 confusion matrix. - -##### Args: - - -* `predictions`: A 1-D array representing the predictions for a given - classification. -* `labels`: A 1-D representing the real labels for the classification task. 
-* `num_classes`: The possible number of labels the classification task can - have. If this value is not provided, it will be calculated - using both predictions and labels array. -* `dtype`: Data type of the confusion matrix. -* `name`: Scope name. -* `weights`: An optional `Tensor` whose shape matches `predictions`. - -##### Returns: - - A k X k matrix representing the confusion matrix, where k is the number of - possible labels in the classification task. - -##### Raises: - - -* `ValueError`: If both predictions and labels are not 1-D vectors and have - mismatched shapes, or if `weights` is not `None` and its shape doesn't - match `predictions`. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn_cell.LSTMStateTuple.__new__.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMStateTuple.__new__.md similarity index 50% rename from tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn_cell.LSTMStateTuple.__new__.md rename to tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMStateTuple.__new__.md index 2bc9c4c8a5e771..fec450ce787152 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn_cell.LSTMStateTuple.__new__.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMStateTuple.__new__.md @@ -1,4 +1,4 @@ -#### `tf.nn.rnn_cell.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} +#### `tf.contrib.rnn.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} Create new instance of LSTMStateTuple(c, h) diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.static_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.static_rnn.md new file mode 100644 index 00000000000000..fb32ce3d2eb7e5 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.static_rnn.md @@ -0,0 +1,65 @@ +### `tf.contrib.rnn.static_rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None)` {#static_rnn} + +Creates a recurrent neural network specified by RNNCell `cell`. + +The simplest form of RNN network generated is: + +```python + state = cell.zero_state(...) + outputs = [] + for input_ in inputs: + output, state = cell(input_, state) + outputs.append(output) + return (outputs, state) +``` +However, a few other options are available: + +An initial state can be provided. +If the sequence_length vector is provided, dynamic calculation is performed. +This method of calculation does not compute the RNN steps past the maximum +sequence length of the minibatch (thus saving computational time), +and properly propagates the state at an example's sequence length +to the final state output. + +The dynamic calculation performed is, at time `t` for batch row `b`, + +```python + (output, state)(b, t) = + (t >= sequence_length(b)) + ? (zeros(cell.output_size), states(b, sequence_length(b) - 1)) + : cell(input(b, t), state(b, t - 1)) +``` + +##### Args: + + +* `cell`: An instance of RNNCell. +* `inputs`: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`, or a nested tuple of such elements. +* `initial_state`: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. 
+ If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. +* `dtype`: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. +* `sequence_length`: Specifies the length of each sequence in inputs. + An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`. +* `scope`: VariableScope for the created subgraph; defaults to "rnn". + +##### Returns: + + A pair (outputs, state) where: + + - outputs is a length T list of outputs (one for each input), or a nested + tuple of such elements. + - state is the final state + +##### Raises: + + +* `TypeError`: If `cell` is not an instance of RNNCell. +* `ValueError`: If `inputs` is `None` or an empty list, or if the input depth + (column size) cannot be inferred from inputs via shape inference. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md index f2c4c2a9fe95e6..25abd415c3af4c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md @@ -1,4 +1,4 @@ -### `tf.strided_slice(input_, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, var=None, name=None)` {#strided_slice} +### `tf.strided_slice(input_, begin, end, strides=None, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, var=None, name=None)` {#strided_slice} Extracts a strided slice from a tensor. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.audio_summary.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.audio_summary.md deleted file mode 100644 index c0aff7777039fc..00000000000000 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.audio_summary.md +++ /dev/null @@ -1,36 +0,0 @@ -### `tf.audio_summary(tag, tensor, sample_rate, max_outputs=3, collections=None, name=None)` {#audio_summary} - -Outputs a `Summary` protocol buffer with audio. - -The summary has up to `max_outputs` summary values containing audio. The -audio is built from `tensor` which must be 3-D with shape `[batch_size, -frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are -assumed to be in the range of `[-1.0, 1.0]` with a sample rate of -`sample_rate`. - -The `tag` argument is a scalar `Tensor` of type `string`. It is used to -build the `tag` of the summary values: - -* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. -* If `max_outputs` is greater than 1, the summary value tags are - generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. - -##### Args: - - -* `tag`: A scalar `Tensor` of type `string`. Used to build the `tag` - of the summary values. -* `tensor`: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]` - or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`. -* `sample_rate`: A Scalar `float32` `Tensor` indicating the sample rate of the - signal in hertz. -* `max_outputs`: Max number of batch elements to generate audio for. -* `collections`: Optional list of ops.GraphKeys. The collections to add the - summary to. Defaults to [ops.GraphKeys.SUMMARIES] -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. 
The serialized `Summary` protocol - buffer. - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md index b20f235de649a3..6176f98c65e8d3 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md @@ -51,7 +51,7 @@ Input of `fit` and `evaluate` should have following features, whose `value` is a `Tensor`. - - - -#### `tf.contrib.learn.DNNRegressor.__init__(hidden_units, feature_columns, model_dir=None, weight_column_name=None, optimizer=None, activation_fn=relu, dropout=None, gradient_clip_norm=None, enable_centered_bias=False, config=None, feature_engineering_fn=None, label_dimension=1)` {#DNNRegressor.__init__} +#### `tf.contrib.learn.DNNRegressor.__init__(hidden_units, feature_columns, model_dir=None, weight_column_name=None, optimizer=None, activation_fn=relu, dropout=None, gradient_clip_norm=None, enable_centered_bias=False, config=None, feature_engineering_fn=None, label_dimension=1, embedding_lr_multipliers=None)` {#DNNRegressor.__init__} Initializes a `DNNRegressor` instance. @@ -88,30 +88,15 @@ Initializes a `DNNRegressor` instance. returns features and labels which will be fed into the model. * `label_dimension`: Dimension of the label for multilabels. Defaults to 1. +* `embedding_lr_multipliers`: Optional. A dictionary from `EbeddingColumn` to + a `float` multiplier. Multiplier will be used to multiply with + learning rate for the embedding variables. ##### Returns: A `DNNRegressor` estimator. -- - - - -#### `tf.contrib.learn.DNNRegressor.__repr__()` {#DNNRegressor.__repr__} - - - - -- - - - -#### `tf.contrib.learn.DNNRegressor.bias_` {#DNNRegressor.bias_} - -DEPRECATED FUNCTION - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - - - - #### `tf.contrib.learn.DNNRegressor.config` {#DNNRegressor.config} @@ -121,96 +106,23 @@ This method will be removed after the deprecation date. To inspect variables, us - - - -#### `tf.contrib.learn.DNNRegressor.dnn_bias_` {#DNNRegressor.dnn_bias_} - -Returns bias of deep neural network part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.dnn_weights_` {#DNNRegressor.dnn_weights_} - -Returns weights of deep neural network part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.evaluate(*args, **kwargs)` {#DNNRegressor.evaluate} - -See `Evaluable`. (deprecated arguments) - -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. -Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. 
- -##### Example conversion: - - est = Estimator(...) -> est = SKCompat(Estimator(...)) +#### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#DNNRegressor.evaluate} - -* `Raises`: -* `ValueError`: If at least one of `x` or `y` is provided, and at least one of - `input_fn` or `feed_fn` is provided. - Or if `metrics` is not `None` or `dict`. +See evaluable.Evaluable. - - - -#### `tf.contrib.learn.DNNRegressor.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=None, exports_to_keep=None)` {#DNNRegressor.export} - +#### `tf.contrib.learn.DNNRegressor.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=1, exports_to_keep=None)` {#DNNRegressor.export} +See BaseEstimator.export. - - - -#### `tf.contrib.learn.DNNRegressor.fit(*args, **kwargs)` {#DNNRegressor.fit} - -See `Trainable`. (deprecated arguments) - -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. -Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. - -##### Example conversion: - - est = Estimator(...) -> est = SKCompat(Estimator(...)) - - -* `Raises`: -* `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. -* `ValueError`: If both `steps` and `max_steps` are not `None`. - - -- - - - -#### `tf.contrib.learn.DNNRegressor.get_params(deep=True)` {#DNNRegressor.get_params} - -Get parameters for this estimator. - -##### Args: - - -* `deep`: boolean, optional +#### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit} - If `True`, will return the parameters for this estimator and - contained subobjects that are estimators. - -##### Returns: - - params : mapping of string to any - Parameter names mapped to their values. +See trainable.Trainable. - - - @@ -237,29 +149,7 @@ Returns value of the variable given by name. ##### Returns: - Numpy array - value of the tensor. - - -- - - - -#### `tf.contrib.learn.DNNRegressor.linear_bias_` {#DNNRegressor.linear_bias_} - -Returns bias of the linear part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). - - -- - - - -#### `tf.contrib.learn.DNNRegressor.linear_weights_` {#DNNRegressor.linear_weights_} - -Returns weights per feature of the linear part. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). + `Tensor` object. - - - @@ -269,62 +159,11 @@ This method will be removed after the deprecation date. To inspect variables, us -- - - - -#### `tf.contrib.learn.DNNRegressor.partial_fit(*args, **kwargs)` {#DNNRegressor.partial_fit} - -Incremental fit on a batch of samples. (deprecated arguments) - -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-12-01. 
-Instructions for updating: -Estimator is decoupled from Scikit Learn interface by moving into -separate class SKCompat. Arguments x, y and batch_size are only -available in the SKCompat class, Estimator will only accept input_fn. - -##### Example conversion: - - est = Estimator(...) -> est = SKCompat(Estimator(...)) - - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. - - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. - - -* `Args`: -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. - - -* `Returns`: - `self`, for chaining. - - -* `Raises`: -* `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. - - - - - #### `tf.contrib.learn.DNNRegressor.predict(*args, **kwargs)` {#DNNRegressor.predict} -Runs inference to determine the predicted class. (deprecated arguments) +Returns predicted scores for given features. (deprecated arguments) SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-15. Instructions for updating: @@ -332,41 +171,21 @@ The default behavior of predict() is changing. The default value for as_iterable will change to True, and then the flag will be removed altogether. The behavior of this flag is described below. - -- - - - -#### `tf.contrib.learn.DNNRegressor.set_params(**params)` {#DNNRegressor.set_params} - -Set the parameters of this estimator. - -The method works on simple estimators as well as on nested objects -(such as pipelines). The former have parameters of the form -``__`` so that it's possible to update each -component of a nested object. - ##### Args: -* `**params`: Parameters. +* `x`: features. +* `input_fn`: Input function. If set, x must be None. +* `batch_size`: Override default batch size. +* `as_iterable`: If True, return an iterable which keeps yielding predictions + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). ##### Returns: - self - -##### Raises: - - -* `ValueError`: If params contain invalid names. - - -- - - - -#### `tf.contrib.learn.DNNRegressor.weights_` {#DNNRegressor.weights_} - -DEPRECATED FUNCTION - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-30. -Instructions for updating: -This method will be removed after the deprecation date. To inspect variables, use get_variable_names() and get_variable_value(). 
+ Numpy array of predicted scores (or an iterable of predicted scores if + as_iterable is True). If `label_dimension == 1`, the shape of the output + is `[batch_size]`, otherwise the shape is `[batch_size, label_dimension]`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.LogisticRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.LogisticRegressor.md index 82a42aaf22e172..a6b774c6bf1c5c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.LogisticRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.LogisticRegressor.md @@ -58,7 +58,7 @@ Initializes a LogisticRegressor. - - - -#### `tf.contrib.learn.LogisticRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LogisticRegressor.evaluate} +#### `tf.contrib.learn.LogisticRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None, checkpoint_path=None)` {#LogisticRegressor.evaluate} Evaluates given model with provided evaluation data. @@ -75,6 +75,8 @@ See superclass Estimator for more details. * `steps`: Number of steps for which to evaluate model. * `metrics`: Dict of metric ops to run. If None, the default metrics are used. * `name`: Name of the evaluation. +* `checkpoint_path`: A specific checkpoint to use. By default, use the latest + checkpoint in the `model_dir`. ##### Returns: @@ -91,36 +93,39 @@ SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23. Instructions for updating: The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn (and in most cases, input_feature_key) will become required args, and use_deprecated_input_fn will default to False and be removed altogether. - Args: - export_dir: A string containing a directory to write the exported graph - and checkpoints. - input_fn: If `use_deprecated_input_fn` is true, then a function that given - `Tensor` of `Example` strings, parses it into features that are then - passed to the model. Otherwise, a function that takes no argument and - returns a tuple of (features, labels), where features is a dict of - string key to `Tensor` and labels is a `Tensor` that's currently not - used (and so can be `None`). - input_feature_key: Only used if `use_deprecated_input_fn` is false. String - key into the features dict returned by `input_fn` that corresponds to a - the raw `Example` strings `Tensor` that the exported model will take as - input. Can only be `None` if you're using a custom `signature_fn` that - does not use the first arg (examples). - use_deprecated_input_fn: Determines the signature format of `input_fn`. - signature_fn: Function that returns a default signature and a named - signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s - for features and `Tensor` or `dict` of `Tensor`s for predictions. - prediction_key: The key for a tensor in the `predictions` dict (output - from the `model_fn`) to use as the `predictions` input to the - `signature_fn`. Optional. If `None`, predictions will pass to - `signature_fn` without filtering. - default_batch_size: Default batch size of the `Example` placeholder. - exports_to_keep: Number of exports to keep. - - Returns: - The string path to the exported directory. NB: this functionality was - added ca. 
2016/09/25; clients that depend on the return value may need - to handle the case where this function returns None because subclasses - are not returning a value. +##### Args: + + +* `export_dir`: A string containing a directory to write the exported graph + and checkpoints. +* `input_fn`: If `use_deprecated_input_fn` is true, then a function that given + `Tensor` of `Example` strings, parses it into features that are then + passed to the model. Otherwise, a function that takes no argument and + returns a tuple of (features, labels), where features is a dict of + string key to `Tensor` and labels is a `Tensor` that's currently not + used (and so can be `None`). +* `input_feature_key`: Only used if `use_deprecated_input_fn` is false. String + key into the features dict returned by `input_fn` that corresponds to a + the raw `Example` strings `Tensor` that the exported model will take as + input. Can only be `None` if you're using a custom `signature_fn` that + does not use the first arg (examples). +* `use_deprecated_input_fn`: Determines the signature format of `input_fn`. +* `signature_fn`: Function that returns a default signature and a named + signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s + for features and `Tensor` or `dict` of `Tensor`s for predictions. +* `prediction_key`: The key for a tensor in the `predictions` dict (output + from the `model_fn`) to use as the `predictions` input to the + `signature_fn`. Optional. If `None`, predictions will pass to + `signature_fn` without filtering. +* `default_batch_size`: Default batch size of the `Example` placeholder. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. NB: this functionality was + added ca. 2016/09/25; clients that depend on the return value may need + to handle the case where this function returns None because subclasses + are not returning a value. - - - @@ -132,28 +137,33 @@ Exports inference graph as a SavedModel into given dir. (experimental) THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning. - Args: - export_dir_base: A string containing a directory to write the exported - graph and checkpoints. - input_fn: A function that takes no argument and - returns an `InputFnOps`. - default_output_alternative_key: the name of the head to serve when none is - specified. - assets_extra: A dict specifying how to populate the assets.extra directory - within the exported SavedModel. Each key should give the destination - path (including the filename) relative to the assets.extra directory. - The corresponding value gives the full path of the source file to be - copied. For example, the simple case of copying a single file without - renaming it is specified as - `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. - as_text: whether to write the SavedModel proto in text format. - exports_to_keep: Number of exports to keep. +##### Args: + + +* `export_dir_base`: A string containing a directory to write the exported + graph and checkpoints. +* `input_fn`: A function that takes no argument and + returns an `InputFnOps`. +* `default_output_alternative_key`: the name of the head to serve when none is + specified. +* `assets_extra`: A dict specifying how to populate the assets.extra directory + within the exported SavedModel. Each key should give the destination + path (including the filename) relative to the assets.extra directory. + The corresponding value gives the full path of the source file to be + copied. 
For example, the simple case of copying a single file without + renaming it is specified as + `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. +* `as_text`: whether to write the SavedModel proto in text format. +* `exports_to_keep`: Number of exports to keep. + +##### Returns: + + The string path to the exported directory. + +##### Raises: - Returns: - The string path to the exported directory. - Raises: - ValueError: if an unrecognized export_type is requested. +* `ValueError`: if an unrecognized export_type is requested. - - - @@ -172,8 +182,9 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Raises: + -* `Raises`: * `ValueError`: If `x` or `y` are not `None` while `input_fn` is not `None`. * `ValueError`: If both `steps` and `max_steps` are not `None`. @@ -265,39 +276,41 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) - This method is expected to be called several times consecutively - on different or the same chunks of the dataset. This either can - implement iterative training or out-of-core/online training. +This method is expected to be called several times consecutively +on different or the same chunks of the dataset. This either can +implement iterative training or out-of-core/online training. - This is especially useful when the whole dataset is too big to - fit in memory at the same time. Or when model is taking long time - to converge, and you want to split up training into subparts. +This is especially useful when the whole dataset is too big to +fit in memory at the same time. Or when model is taking long time +to converge, and you want to split up training into subparts. + +##### Args: -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of labels. The training label values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. + iterator that returns array of labels. The training label values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. * `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. + `None`. * `steps`: Number of steps for which to train model. If `None`, train forever. * `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. + dimension of `x`. Must be `None` if `input_fn` is provided. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. + inside the training loop. +##### Returns: -* `Returns`: - `self`, for chaining. + `self`, for chaining. + +##### Raises: -* `Raises`: * `ValueError`: If at least one of `x` and `y` is provided, and `input_fn` is - provided. + provided. - - - @@ -316,30 +329,32 @@ available in the SKCompat class, Estimator will only accept input_fn. est = Estimator(...) -> est = SKCompat(Estimator(...)) +##### Args: + -* `Args`: * `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. 
The training input samples for fitting the - model. If set, `input_fn` must be `None`. + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. * `input_fn`: Input function. If set, `x` and 'batch_size' must be `None`. * `batch_size`: Override default batch size. If set, 'input_fn' must be - 'None'. + 'None'. * `outputs`: list of `str`, name of the output to predict. - If `None`, returns all. + If `None`, returns all. * `as_iterable`: If True, return an iterable which keeps yielding predictions - for each example until inputs are exhausted. Note: The inputs must - terminate if you want the iterable to terminate (e.g. be sure to pass - num_epochs=1 if you are using something like read_batch_features). + for each example until inputs are exhausted. Note: The inputs must + terminate if you want the iterable to terminate (e.g. be sure to pass + num_epochs=1 if you are using something like read_batch_features). +##### Returns: + + A numpy array of predicted classes or regression values if the + constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` + of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of + predictions if as_iterable is True. -* `Returns`: - A numpy array of predicted classes or regression values if the - constructor's `model_fn` returns a `Tensor` for `predictions` or a `dict` - of numpy arrays if `model_fn` returns a `dict`. Returns an iterable of - predictions if as_iterable is True. +##### Raises: -* `Raises`: * `ValueError`: If x and input_fn are both provided or both `None`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq.md new file mode 100644 index 00000000000000..4107deb1340fd1 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq.md @@ -0,0 +1,53 @@ +### `tf.contrib.legacy_seq2seq.embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, embedding_size, num_decoder_symbols=None, output_projection=None, feed_previous=False, dtype=None, scope=None)` {#embedding_tied_rnn_seq2seq} + +Embedding RNN sequence-to-sequence model with tied (shared) parameters. + +This model first embeds encoder_inputs by a newly created embedding (of shape +[num_symbols x input_size]). Then it runs an RNN to encode embedded +encoder_inputs into a state vector. Next, it embeds decoder_inputs using +the same embedding. Then it runs RNN decoder, initialized with the last +encoder state, on embedded decoder_inputs. The decoder output is over symbols +from 0 to num_decoder_symbols - 1 if num_decoder_symbols is none; otherwise it +is over 0 to num_symbols - 1. + +##### Args: + + +* `encoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `decoder_inputs`: A list of 1D int32 Tensors of shape [batch_size]. +* `cell`: rnn_cell.RNNCell defining the cell function and size. +* `num_symbols`: Integer; number of symbols for both encoder and decoder. +* `embedding_size`: Integer, the length of the embedding vector for each symbol. +* `num_decoder_symbols`: Integer; number of output symbols for decoder. If + provided, the decoder output is over symbols 0 to num_decoder_symbols - 1. + Otherwise, decoder output is over symbols 0 to num_symbols - 1. 
Note that + this assumes that the vocabulary is set up such that the first + num_decoder_symbols of num_symbols are part of decoding. +* `output_projection`: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. +* `feed_previous`: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). +* `dtype`: The dtype to use for the initial RNN states (default: tf.float32). +* `scope`: VariableScope for the created subgraph; defaults to + "embedding_tied_rnn_seq2seq". + +##### Returns: + + A tuple of the form (outputs, state), where: + +* `outputs`: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_symbols] containing the generated + outputs where output_symbols = num_decoder_symbols if + num_decoder_symbols is not None otherwise output_symbols = num_symbols. +* `state`: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + +##### Raises: + + +* `ValueError`: When output_projection has the wrong shape. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.get_total_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.get_total_loss.md index 34afd8d725841d..533121794fd8d3 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.get_total_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.get_total_loss.md @@ -1,6 +1,10 @@ -### `tf.contrib.losses.get_total_loss(add_regularization_losses=True, name='total_loss')` {#get_total_loss} +### `tf.contrib.losses.get_total_loss(*args, **kwargs)` {#get_total_loss} -Returns a tensor whose value represents the total loss. +Returns a tensor whose value represents the total loss. (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.get_total_loss instead. Notice that the function adds the given losses to the regularization losses. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.mean_pairwise_squared_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.mean_pairwise_squared_error.md index dcba175d74fc30..ac1de723a918c0 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.mean_pairwise_squared_error.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.losses.mean_pairwise_squared_error.md @@ -1,46 +1,55 @@ ### `tf.contrib.losses.mean_pairwise_squared_error(*args, **kwargs)` {#mean_pairwise_squared_error} -Adds a pairwise-errors-squared loss to the training procedure. (deprecated arguments) +Adds a pairwise-errors-squared loss to the training procedure. (deprecated arguments) (deprecated) + +THIS FUNCTION IS DEPRECATED. It will be removed after 2016-12-30. +Instructions for updating: +Use tf.losses.mean_pairwise_squared_error instead. SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-11-25. Instructions for updating: `targets` is being deprecated, use `labels`. 
`weight` is being deprecated, use `weights`. - Unlike `mean_squared_error`, which is a measure of the differences between - corresponding elements of `predictions` and `labels`, - `mean_pairwise_squared_error` is a measure of the differences between pairs of - corresponding elements of `predictions` and `labels`. - - For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are - three pairs of differences are summed to compute the loss: - loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 - - Note that since the inputs are of size [batch_size, d0, ... dN], the - corresponding pairs are computed within each batch sample but not across - samples within a batch. For example, if `predictions` represents a batch of - 16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs - is drawn from each image, but not across images. - - `weight` acts as a coefficient for the loss. If a scalar is provided, then the - loss is simply scaled by the given value. If `weight` is a tensor of size - [batch_size], then the total loss for each sample of the batch is rescaled - by the corresponding element in the `weight` vector. - - Args: - predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN] - where N+1 is the total number of dimensions in `predictions`. - labels: The ground truth output tensor, whose shape must match the shape of - the `predictions` tensor. - weights: Coefficients for the loss a scalar, a tensor of shape [batch_size] - or a tensor whose shape matches `predictions`. - scope: The scope for the operations performed in computing the loss. - targets: Deprecated alias for `labels`. - weight: Deprecated alias for `weights`. - - Returns: - A scalar `Tensor` representing the loss value. - - Raises: - ValueError: If the shape of `predictions` doesn't match that of `labels` or - if the shape of `weight` is invalid. +Unlike `mean_squared_error`, which is a measure of the differences between +corresponding elements of `predictions` and `labels`, +`mean_pairwise_squared_error` is a measure of the differences between pairs of +corresponding elements of `predictions` and `labels`. + +For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are +three pairs of differences are summed to compute the loss: + loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 + +Note that since the inputs are of size [batch_size, d0, ... dN], the +corresponding pairs are computed within each batch sample but not across +samples within a batch. For example, if `predictions` represents a batch of +16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs +is drawn from each image, but not across images. + +`weight` acts as a coefficient for the loss. If a scalar is provided, then the +loss is simply scaled by the given value. If `weight` is a tensor of size +[batch_size], then the total loss for each sample of the batch is rescaled +by the corresponding element in the `weight` vector. + +##### Args: + + +* `predictions`: The predicted outputs, a tensor of size [batch_size, d0, .. dN] + where N+1 is the total number of dimensions in `predictions`. +* `labels`: The ground truth output tensor, whose shape must match the shape of + the `predictions` tensor. +* `weights`: Coefficients for the loss a scalar, a tensor of shape [batch_size] + or a tensor whose shape matches `predictions`. +* `scope`: The scope for the operations performed in computing the loss. 
+* `targets`: Deprecated alias for `labels`. +* `weight`: Deprecated alias for `weights`. + +##### Returns: + + A scalar `Tensor` representing the loss value. + +##### Raises: + + +* `ValueError`: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.training.SequenceQueueingStateSaver.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.training.SequenceQueueingStateSaver.md index 682d8e5f2ed16c..84c51fa9a41358 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.training.SequenceQueueingStateSaver.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.training.SequenceQueueingStateSaver.md @@ -56,7 +56,7 @@ Example usage: batch_size = 32 num_unroll = 20 lstm_size = 8 -cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) +cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_size) initial_state_values = tf.zeros(cell.state_size, dtype=tf.float32) raw_data = get_single_input_from_input_reader() diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.raw_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.raw_rnn.md index 8cb2eab12f13b7..fd18b6ebb8cc27 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.raw_rnn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.raw_rnn.md @@ -51,7 +51,7 @@ sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32) inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) inputs_ta = inputs_ta.unpack(inputs) -cell = tf.nn.rnn_cell.LSTMCell(num_units) +cell = tf.contrib.rnn.LSTMCell(num_units) def loop_fn(time, cell_output, cell_state, loop_state): emit_output = cell_output # == None for time == 0 diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.zero_fraction.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.zero_fraction.md index 766341a73f0a5d..dc519bbf764457 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.zero_fraction.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.zero_fraction.md @@ -8,7 +8,8 @@ This is useful in summaries to measure and report sparsity. For example, ```python z = tf.Relu(...) - summ = tf.contrib.deprecated.scalar_summary('sparsity', tf.nn.zero_fraction(z)) + summ = tf.contrib.deprecated.scalar_summary('sparsity', + tf.nn.zero_fraction(z)) ``` ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md index b2f9570b2c605d..13f54b20dabc42 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md @@ -13,7 +13,7 @@ The `SparseTensor` returned by this function has the following properties: - `indices` is equivalent to `sp_ids.indices` with the last dimension discarded and replaced with `sp_ids.values`. - `values` is simply `sp_values.values`. - - If `sp_ids.shape = [D0, D1, ..., Dn, K]`, then + - If `sp_ids.dense_shape = [D0, D1, ..., Dn, K]`, then `output.shape = [D0, D1, ..., Dn, vocab_size]`. 
For example, consider the following feature vectors: @@ -54,7 +54,7 @@ equal to: ```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], - shape=[3, 6]) + dense_shape=[3, 6]) ``` ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md index f3127013bf13c6..6172e005c1495e 100644 --- a/tensorflow/g3doc/api_docs/python/index.md +++ b/tensorflow/g3doc/api_docs/python/index.md @@ -9,6 +9,7 @@ * [`control_dependencies`](../../api_docs/python/framework.md#control_dependencies) * [`convert_to_tensor`](../../api_docs/python/framework.md#convert_to_tensor) * [`convert_to_tensor_or_indexed_slices`](../../api_docs/python/framework.md#convert_to_tensor_or_indexed_slices) + * [`convert_to_tensor_or_sparse_tensor`](../../api_docs/python/framework.md#convert_to_tensor_or_sparse_tensor) * [`device`](../../api_docs/python/framework.md#device) * [`DeviceSpec`](../../api_docs/python/framework.md#DeviceSpec) * [`Dimension`](../../api_docs/python/framework.md#Dimension) @@ -200,7 +201,6 @@ * [`argmin`](../../api_docs/python/math_ops.md#argmin) * [`asin`](../../api_docs/python/math_ops.md#asin) * [`atan`](../../api_docs/python/math_ops.md#atan) - * [`batch_matmul`](../../api_docs/python/math_ops.md#batch_matmul) * [`betainc`](../../api_docs/python/math_ops.md#betainc) * [`ceil`](../../api_docs/python/math_ops.md#ceil) * [`cholesky`](../../api_docs/python/math_ops.md#cholesky) @@ -415,7 +415,6 @@ * [`transpose_image`](../../api_docs/python/image.md#transpose_image) * **[Sparse Tensors](../../api_docs/python/sparse_ops.md)**: - * [`shape`](../../api_docs/python/sparse_ops.md#shape) * [`sparse_add`](../../api_docs/python/sparse_ops.md#sparse_add) * [`sparse_concat`](../../api_docs/python/sparse_ops.md#sparse_concat) * [`sparse_fill_empty_rows`](../../api_docs/python/sparse_ops.md#sparse_fill_empty_rows) @@ -455,6 +454,10 @@ * [`limit_epochs`](../../api_docs/python/io_ops.md#limit_epochs) * [`match_filenames_once`](../../api_docs/python/io_ops.md#match_filenames_once) * [`matching_files`](../../api_docs/python/io_ops.md#matching_files) + * [`maybe_batch`](../../api_docs/python/io_ops.md#maybe_batch) + * [`maybe_batch_join`](../../api_docs/python/io_ops.md#maybe_batch_join) + * [`maybe_shuffle_batch`](../../api_docs/python/io_ops.md#maybe_shuffle_batch) + * [`maybe_shuffle_batch_join`](../../api_docs/python/io_ops.md#maybe_shuffle_batch_join) * [`PaddingFIFOQueue`](../../api_docs/python/io_ops.md#PaddingFIFOQueue) * [`parse_example`](../../api_docs/python/io_ops.md#parse_example) * [`parse_single_example`](../../api_docs/python/io_ops.md#parse_single_example) @@ -473,6 +476,7 @@ * [`slice_input_producer`](../../api_docs/python/io_ops.md#slice_input_producer) * [`sparse_placeholder`](../../api_docs/python/io_ops.md#sparse_placeholder) * [`SparseConditionalAccumulator`](../../api_docs/python/io_ops.md#SparseConditionalAccumulator) + * [`SparseFeature`](../../api_docs/python/io_ops.md#SparseFeature) * [`string_input_producer`](../../api_docs/python/io_ops.md#string_input_producer) * [`TextLineReader`](../../api_docs/python/io_ops.md#TextLineReader) * [`TFRecordReader`](../../api_docs/python/io_ops.md#TFRecordReader) @@ -488,6 +492,7 @@ * **[Neural Network](../../api_docs/python/nn.md)**: * [`atrous_conv2d`](../../api_docs/python/nn.md#atrous_conv2d) + * [`atrous_conv2d_transpose`](../../api_docs/python/nn.md#atrous_conv2d_transpose) * [`avg_pool`](../../api_docs/python/nn.md#avg_pool) * 
[`avg_pool3d`](../../api_docs/python/nn.md#avg_pool3d) * [`batch_norm_with_global_normalization`](../../api_docs/python/nn.md#batch_norm_with_global_normalization) @@ -564,19 +569,7 @@ * [`weighted_cross_entropy_with_logits`](../../api_docs/python/nn.md#weighted_cross_entropy_with_logits) * [`weighted_moments`](../../api_docs/python/nn.md#weighted_moments) * [`with_space_to_batch`](../../api_docs/python/nn.md#with_space_to_batch) - -* **[Neural Network RNN Cells](../../api_docs/python/rnn_cell.md)**: - * [`BasicLSTMCell`](../../api_docs/python/rnn_cell.md#BasicLSTMCell) - * [`BasicRNNCell`](../../api_docs/python/rnn_cell.md#BasicRNNCell) - * [`DropoutWrapper`](../../api_docs/python/rnn_cell.md#DropoutWrapper) - * [`EmbeddingWrapper`](../../api_docs/python/rnn_cell.md#EmbeddingWrapper) - * [`GRUCell`](../../api_docs/python/rnn_cell.md#GRUCell) - * [`InputProjectionWrapper`](../../api_docs/python/rnn_cell.md#InputProjectionWrapper) - * [`LSTMCell`](../../api_docs/python/rnn_cell.md#LSTMCell) - * [`LSTMStateTuple`](../../api_docs/python/rnn_cell.md#LSTMStateTuple) - * [`MultiRNNCell`](../../api_docs/python/rnn_cell.md#MultiRNNCell) - * [`OutputProjectionWrapper`](../../api_docs/python/rnn_cell.md#OutputProjectionWrapper) - * [`RNNCell`](../../api_docs/python/rnn_cell.md#RNNCell) + * [`zero_fraction`](../../api_docs/python/nn.md#zero_fraction) * **[Running Graphs](../../api_docs/python/client.md)**: * [`AbortedError`](../../api_docs/python/client.md#AbortedError) @@ -611,7 +604,6 @@ * [`add_queue_runner`](../../api_docs/python/train.md#add_queue_runner) * [`AggregationMethod`](../../api_docs/python/train.md#AggregationMethod) * [`assert_global_step`](../../api_docs/python/train.md#assert_global_step) - * [`audio_summary`](../../api_docs/python/train.md#audio_summary) * [`basic_train_loop`](../../api_docs/python/train.md#basic_train_loop) * [`checkpoint_exists`](../../api_docs/python/train.md#checkpoint_exists) * [`CheckpointSaverHook`](../../api_docs/python/train.md#CheckpointSaverHook) @@ -635,13 +627,9 @@ * [`GradientDescentOptimizer`](../../api_docs/python/train.md#GradientDescentOptimizer) * [`gradients`](../../api_docs/python/train.md#gradients) * [`hessians`](../../api_docs/python/train.md#hessians) - * [`histogram_summary`](../../api_docs/python/train.md#histogram_summary) - * [`image_summary`](../../api_docs/python/train.md#image_summary) * [`inverse_time_decay`](../../api_docs/python/train.md#inverse_time_decay) * [`LoggingTensorHook`](../../api_docs/python/train.md#LoggingTensorHook) * [`LooperThread`](../../api_docs/python/train.md#LooperThread) - * [`merge_all_summaries`](../../api_docs/python/train.md#merge_all_summaries) - * [`merge_summary`](../../api_docs/python/train.md#merge_summary) * [`MomentumOptimizer`](../../api_docs/python/train.md#MomentumOptimizer) * [`MonitoredSession`](../../api_docs/python/train.md#MonitoredSession) * [`MonitoredTrainingSession`](../../api_docs/python/train.md#MonitoredTrainingSession) @@ -658,7 +646,6 @@ * [`replica_device_setter`](../../api_docs/python/train.md#replica_device_setter) * [`RMSPropOptimizer`](../../api_docs/python/train.md#RMSPropOptimizer) * [`Scaffold`](../../api_docs/python/train.md#Scaffold) - * [`scalar_summary`](../../api_docs/python/train.md#scalar_summary) * [`Server`](../../api_docs/python/train.md#Server) * [`SessionCreator`](../../api_docs/python/train.md#SessionCreator) * [`SessionManager`](../../api_docs/python/train.md#SessionManager) @@ -672,14 +659,11 @@ * 
[`StopAtStepHook`](../../api_docs/python/train.md#StopAtStepHook) * [`summary_iterator`](../../api_docs/python/train.md#summary_iterator) * [`SummarySaverHook`](../../api_docs/python/train.md#SummarySaverHook) - * [`SummaryWriter`](../../api_docs/python/train.md#SummaryWriter) - * [`SummaryWriterCache`](../../api_docs/python/train.md#SummaryWriterCache) * [`Supervisor`](../../api_docs/python/train.md#Supervisor) * [`SyncReplicasOptimizer`](../../api_docs/python/train.md#SyncReplicasOptimizer) * [`SyncReplicasOptimizerV2`](../../api_docs/python/train.md#SyncReplicasOptimizerV2) * [`WorkerSessionCreator`](../../api_docs/python/train.md#WorkerSessionCreator) * [`write_graph`](../../api_docs/python/train.md#write_graph) - * [`zero_fraction`](../../api_docs/python/train.md#zero_fraction) * **[Wraps python functions](../../api_docs/python/script_ops.md)**: * [`py_func`](../../api_docs/python/script_ops.md#py_func) @@ -1014,11 +998,28 @@ * [`StepCounter`](../../api_docs/python/contrib.learn.monitors.md#StepCounter) * [`StopAtStep`](../../api_docs/python/contrib.learn.monitors.md#StopAtStep) * [`SummarySaver`](../../api_docs/python/contrib.learn.monitors.md#SummarySaver) + * [`SummaryWriterCache`](../../api_docs/python/contrib.learn.monitors.md#SummaryWriterCache) * [`ValidationMonitor`](../../api_docs/python/contrib.learn.monitors.md#ValidationMonitor) +* **[Sequence to Sequence (contrib)](../../api_docs/python/contrib.legacy_seq2seq.md)**: + * [`attention_decoder`](../../api_docs/python/contrib.legacy_seq2seq.md#attention_decoder) + * [`basic_rnn_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#basic_rnn_seq2seq) + * [`embedding_attention_decoder`](../../api_docs/python/contrib.legacy_seq2seq.md#embedding_attention_decoder) + * [`embedding_attention_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#embedding_attention_seq2seq) + * [`embedding_rnn_decoder`](../../api_docs/python/contrib.legacy_seq2seq.md#embedding_rnn_decoder) + * [`embedding_rnn_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#embedding_rnn_seq2seq) + * [`embedding_tied_rnn_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#embedding_tied_rnn_seq2seq) + * [`model_with_buckets`](../../api_docs/python/contrib.legacy_seq2seq.md#model_with_buckets) + * [`one2many_rnn_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#one2many_rnn_seq2seq) + * [`rnn_decoder`](../../api_docs/python/contrib.legacy_seq2seq.md#rnn_decoder) + * [`sequence_loss`](../../api_docs/python/contrib.legacy_seq2seq.md#sequence_loss) + * [`sequence_loss_by_example`](../../api_docs/python/contrib.legacy_seq2seq.md#sequence_loss_by_example) + * [`tied_rnn_seq2seq`](../../api_docs/python/contrib.legacy_seq2seq.md#tied_rnn_seq2seq) + * **[Linear Algebra (contrib)](../../api_docs/python/contrib.linalg.md)**: * [`LinearOperator`](../../api_docs/python/contrib.linalg.md#LinearOperator) * [`LinearOperatorDiag`](../../api_docs/python/contrib.linalg.md#LinearOperatorDiag) + * [`LinearOperatorTriL`](../../api_docs/python/contrib.linalg.md#LinearOperatorTriL) * **[Losses (contrib)](../../api_docs/python/contrib.losses.md)**: * [`absolute_difference`](../../api_docs/python/contrib.losses.md#absolute_difference) @@ -1036,20 +1037,34 @@ * [`softmax_cross_entropy`](../../api_docs/python/contrib.losses.md#softmax_cross_entropy) * [`sparse_softmax_cross_entropy`](../../api_docs/python/contrib.losses.md#sparse_softmax_cross_entropy) -* **[RNN (contrib)](../../api_docs/python/contrib.rnn.md)**: +* **[RNN and Cells 
(contrib)](../../api_docs/python/contrib.rnn.md)**: * [`AttentionCellWrapper`](../../api_docs/python/contrib.rnn.md#AttentionCellWrapper) + * [`BasicLSTMCell`](../../api_docs/python/contrib.rnn.md#BasicLSTMCell) + * [`BasicRNNCell`](../../api_docs/python/contrib.rnn.md#BasicRNNCell) * [`BidirectionalGridLSTMCell`](../../api_docs/python/contrib.rnn.md#BidirectionalGridLSTMCell) * [`CoupledInputForgetGateLSTMCell`](../../api_docs/python/contrib.rnn.md#CoupledInputForgetGateLSTMCell) + * [`DropoutWrapper`](../../api_docs/python/contrib.rnn.md#DropoutWrapper) + * [`EmbeddingWrapper`](../../api_docs/python/contrib.rnn.md#EmbeddingWrapper) * [`FusedRNNCell`](../../api_docs/python/contrib.rnn.md#FusedRNNCell) * [`FusedRNNCellAdaptor`](../../api_docs/python/contrib.rnn.md#FusedRNNCellAdaptor) * [`GridLSTMCell`](../../api_docs/python/contrib.rnn.md#GridLSTMCell) * [`GRUBlockCell`](../../api_docs/python/contrib.rnn.md#GRUBlockCell) + * [`GRUCell`](../../api_docs/python/contrib.rnn.md#GRUCell) + * [`InputProjectionWrapper`](../../api_docs/python/contrib.rnn.md#InputProjectionWrapper) * [`LayerNormBasicLSTMCell`](../../api_docs/python/contrib.rnn.md#LayerNormBasicLSTMCell) * [`LSTMBlockCell`](../../api_docs/python/contrib.rnn.md#LSTMBlockCell) * [`LSTMBlockFusedCell`](../../api_docs/python/contrib.rnn.md#LSTMBlockFusedCell) * [`LSTMBlockWrapper`](../../api_docs/python/contrib.rnn.md#LSTMBlockWrapper) + * [`LSTMCell`](../../api_docs/python/contrib.rnn.md#LSTMCell) + * [`LSTMStateTuple`](../../api_docs/python/contrib.rnn.md#LSTMStateTuple) + * [`MultiRNNCell`](../../api_docs/python/contrib.rnn.md#MultiRNNCell) + * [`OutputProjectionWrapper`](../../api_docs/python/contrib.rnn.md#OutputProjectionWrapper) + * [`RNNCell`](../../api_docs/python/contrib.rnn.md#RNNCell) * [`stack_bidirectional_dynamic_rnn`](../../api_docs/python/contrib.rnn.md#stack_bidirectional_dynamic_rnn) * [`stack_bidirectional_rnn`](../../api_docs/python/contrib.rnn.md#stack_bidirectional_rnn) + * [`static_bidirectional_rnn`](../../api_docs/python/contrib.rnn.md#static_bidirectional_rnn) + * [`static_rnn`](../../api_docs/python/contrib.rnn.md#static_rnn) + * [`static_state_saving_rnn`](../../api_docs/python/contrib.rnn.md#static_state_saving_rnn) * [`TimeFreqLSTMCell`](../../api_docs/python/contrib.rnn.md#TimeFreqLSTMCell) * [`TimeReversedFusedRNN`](../../api_docs/python/contrib.rnn.md#TimeReversedFusedRNN) @@ -1058,7 +1073,6 @@ * [`aggregate_metric_map`](../../api_docs/python/contrib.metrics.md#aggregate_metric_map) * [`aggregate_metrics`](../../api_docs/python/contrib.metrics.md#aggregate_metrics) * [`auc_using_histogram`](../../api_docs/python/contrib.metrics.md#auc_using_histogram) - * [`confusion_matrix`](../../api_docs/python/contrib.metrics.md#confusion_matrix) * [`set_difference`](../../api_docs/python/contrib.metrics.md#set_difference) * [`set_intersection`](../../api_docs/python/contrib.metrics.md#set_intersection) * [`set_size`](../../api_docs/python/contrib.metrics.md#set_size) diff --git a/tensorflow/g3doc/api_docs/python/io_ops.md b/tensorflow/g3doc/api_docs/python/io_ops.md index 216586084a0963..7acb5b287dc46a 100644 --- a/tensorflow/g3doc/api_docs/python/io_ops.md +++ b/tensorflow/g3doc/api_docs/python/io_ops.md @@ -101,7 +101,7 @@ with tf.Session() as sess: print(sess.run(y, feed_dict={ x: (indices, values, shape)})) # Will succeed. 
- sp = tf.SparseTensor(indices=indices, values=values, shape=shape) + sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape) sp_value = sp.eval(session) print(sess.run(y, feed_dict={x: sp_value})) # Will succeed. ``` @@ -1458,6 +1458,87 @@ Alias for field number 0 +- - - + +### `class tf.SparseFeature` {#SparseFeature} + +Configuration for parsing a sparse input feature. + +Fields: + index_key: Name of index feature. The underlying feature's type must + be `int64` and its length must always match that of the `value_key` + feature. + value_key: Name of value feature. The underlying feature's type must + be `dtype` and its length must always match that of the `index_key` + feature. + dtype: Data type of the `value_key` feature. + size: Each value in the `index_key` feature must be in `[0, size)`. + already_sorted: A boolean to specify whether the values in `index_key` are + already sorted. If so skip sorting, False by default (optional). +- - - + +#### `tf.SparseFeature.__getnewargs__()` {#SparseFeature.__getnewargs__} + +Return self as a plain tuple. Used by copy and pickle. + + +- - - + +#### `tf.SparseFeature.__getstate__()` {#SparseFeature.__getstate__} + +Exclude the OrderedDict from pickling + + +- - - + +#### `tf.SparseFeature.__new__(_cls, index_key, value_key, dtype, size, already_sorted=False)` {#SparseFeature.__new__} + +Create new instance of SparseFeature(index_key, value_key, dtype, size, already_sorted) + + +- - - + +#### `tf.SparseFeature.__repr__()` {#SparseFeature.__repr__} + +Return a nicely formatted representation string + + +- - - + +#### `tf.SparseFeature.already_sorted` {#SparseFeature.already_sorted} + +Alias for field number 4 + + +- - - + +#### `tf.SparseFeature.dtype` {#SparseFeature.dtype} + +Alias for field number 2 + + +- - - + +#### `tf.SparseFeature.index_key` {#SparseFeature.index_key} + +Alias for field number 0 + + +- - - + +#### `tf.SparseFeature.size` {#SparseFeature.size} + +Alias for field number 3 + + +- - - + +#### `tf.SparseFeature.value_key` {#SparseFeature.value_key} + +Alias for field number 1 + + + - - - ### `tf.parse_example(serialized, features, name=None, example_names=None)` {#parse_example} @@ -1469,18 +1550,27 @@ protos given in `serialized`. `example_names` may contain descriptive names for the corresponding serialized protos. These may be useful for debugging purposes, but they have no effect on -the output. If not `None`, `example_names` must be the same length as `serialized`. +the output. If not `None`, `example_names` must be the same length as +`serialized`. This op parses serialized examples into a dictionary mapping keys to `Tensor` -and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature` -and `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a -`SparseTensor`, and each `FixedLenFeature` is mapped to a `Tensor`. +and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`, +`SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature` +and `SparseFeature` is mapped to a `SparseTensor`, and each +`FixedLenFeature` is mapped to a `Tensor`. Each `VarLenFeature` maps to a `SparseTensor` of the specified type representing a ragged matrix. Its indices are `[batch, index]` where `batch` is the batch entry the value is from in `serialized`, and `index` is the value's index in the list of values associated with that feature and example. 
+Each `SparseFeature` maps to a `SparseTensor` of the specified type +representing a sparse matrix of shape +`(serialized.size(), SparseFeature.size)`. Its indices are `[batch, index]` +where `batch` is the batch entry the value is from in `serialized`, and +`index` is the value's index is given by the values in the +`SparseFeature.index_key` feature column. + Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. @@ -1509,7 +1599,7 @@ then the output will look like: ``` {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], values=[1.0, 2.0, 3.0], - shape=(3, 2)) } + dense_shape=(3, 2)) } ``` Given two `Example` input protos in `serialized`: @@ -1592,13 +1682,48 @@ And the expected output is: } ``` +Given two `Example` input protos in `serialized`: + +``` +[ + features { + feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } + feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } + }, + features { + feature { key: "val" value { float_list { value: [ 0.0 ] } } } + feature { key: "ix" value { int64_list { value: [ 42 ] } } } + } +] +``` + +And arguments + +``` +example_names: ["input0", "input1"], +features: { + "sparse": SparseFeature("ix", "val", tf.float32, 100), +} +``` + +Then the output is a dictionary: + +```python +{ + "sparse": SparseTensor( + indices=[[0, 3], [0, 20], [1, 42]], + values=[0.5, -1.0, 0.0] + shape=[2, 100]), +} +``` + ##### Args: * `serialized`: A vector (1-D Tensor) of strings, a batch of binary serialized `Example` protos. -* `features`: A `dict` mapping feature keys to `FixedLenFeature` or - `VarLenFeature` values. +* `features`: A `dict` mapping feature keys to `FixedLenFeature`, + `VarLenFeature`, and `SparseFeature` values. * `name`: A name for this operation (optional). * `example_names`: A vector (1-D Tensor) of strings (optional), the names of the serialized protos in the batch. @@ -2972,7 +3097,7 @@ single subgraph producing examples but you want to run it in *N* threads (where you increase *N* until it can keep the queue full). Use [`batch_join`](#batch_join) or [`shuffle_batch_join`](#shuffle_batch_join) if you have *N* different subgraphs producing examples to batch and you -want them run by *N* threads. +want them run by *N* threads. Use `maybe_*` to enqueue conditionally. - - - @@ -3053,6 +3178,48 @@ Note: if `num_epochs` is not `None`, this function creates local counter ##### Raises: +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. + + +- - - + +### `tf.train.maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_batch} + +Conditionally creates batches of tensors based on `keep_input`. + +See docstring in `batch` for more details. + +##### Args: + + +* `tensors`: The list or dictionary of tensors to enqueue. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `batch_size`: The new batch size pulled from the queue. +* `num_threads`: The number of threads enqueuing `tensors`. +* `capacity`: An integer. The maximum number of elements in the queue. +* `enqueue_many`: Whether each tensor in `tensors` is a single example. 
+* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors`. +* `dynamic_pad`: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (Optional). If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same types as `tensors`. + +##### Raises: + + * `ValueError`: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors`. @@ -3144,6 +3311,49 @@ operations that depend on fixed batch_size would fail. ##### Raises: +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensor_list_list`. + + +- - - + +### `tf.train.maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_batch_join} + +Runs a list of tensors to conditionally fill a queue to create batches. + +See docstring in `batch_join` for more details. + +##### Args: + + +* `tensors_list`: A list of tuples or dictionaries of tensors to enqueue. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `batch_size`: An integer. The new batch size pulled from the queue. +* `capacity`: An integer. The maximum number of elements in the queue. +* `enqueue_many`: Whether each tensor in `tensor_list_list` is a single + example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list_list[i]`. +* `dynamic_pad`: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (Optional) If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + +##### Raises: + + * `ValueError`: If the `shapes` are not specified, and cannot be inferred from the elements of `tensor_list_list`. @@ -3233,6 +3443,48 @@ Note: if `num_epochs` is not `None`, this function creates local counter ##### Raises: +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. + + +- - - + +### `tf.train.maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, keep_input, num_threads=1, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_shuffle_batch} + +Creates batches by randomly shuffling conditionally-enqueued tensors. + +See docstring in `shuffle_batch` for more details. + +##### Args: + + +* `tensors`: The list or dictionary of tensors to enqueue. +* `batch_size`: The new batch size pulled from the queue. 
+* `capacity`: An integer. The maximum number of elements in the queue. +* `min_after_dequeue`: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. +* `keep_input`: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. +* `num_threads`: The number of threads enqueuing `tensor_list`. +* `seed`: Seed for the random shuffling within the queue. +* `enqueue_many`: Whether each tensor in `tensor_list` is a single example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list`. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (Optional) If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the types as `tensors`. + +##### Raises: + + * `ValueError`: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors`. @@ -3317,3 +3569,46 @@ operations that depend on fixed batch_size would fail. inferred from the elements of `tensors_list`. +- - - + +### `tf.train.maybe_shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, keep_input, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None)` {#maybe_shuffle_batch_join} + +Create batches by randomly shuffling conditionally-enqueued tensors. + +See docstring in `shuffle_batch_join` for more details. + +##### Args: + + +* `tensors_list`: A list of tuples or dictionaries of tensors to enqueue. +* `batch_size`: An integer. The new batch size pulled from the queue. +* `capacity`: An integer. The maximum number of elements in the queue. +* `min_after_dequeue`: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. +* `keep_input`: A `bool` scalar Tensor. If provided, this tensor controls + whether the input is added to the queue or not. If it evaluates `True`, + then `tensors_list` are added to the queue; otherwise they are dropped. + This tensor essentially acts as a filtering mechanism. +* `seed`: Seed for the random shuffling within the queue. +* `enqueue_many`: Whether each tensor in `tensor_list_list` is a single + example. +* `shapes`: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors_list[i]`. +* `allow_smaller_final_batch`: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. +* `shared_name`: (optional). If set, this queue will be shared under the given + name across multiple sessions. +* `name`: (Optional) A name for the operations. + +##### Returns: + + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + +##### Raises: + + +* `ValueError`: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors_list`. 
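As a brief illustration of the new `maybe_*` batching functions documented above, here is a minimal sketch (not part of the diff itself) that uses `tf.train.maybe_batch` to keep only even-valued examples. It assumes the TF 1.x queue-based input pipeline; `tf.train.range_input_producer`, `tf.train.Coordinator`, and `tf.train.start_queue_runners` are used purely for illustration and are not part of this change:

```python
import tensorflow as tf

# Minimal sketch of conditional batching with tf.train.maybe_batch.
# A range queue produces the integers 0..99 in order; keep_input drops
# every odd example before it reaches the batching queue.
example = tf.train.range_input_producer(100, shuffle=False).dequeue()
keep = tf.equal(example % 2, 0)  # scalar bool: True only for even values
evens = tf.train.maybe_batch([example], keep_input=keep, batch_size=8)[0]

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run(evens))  # e.g. [ 0  2  4  6  8 10 12 14]
    coord.request_stop()
    coord.join(threads)
```

`keep_input` acts purely as a filter: examples for which it evaluates `False` are dropped rather than enqueued, so the batch size is still honored using only the surviving examples.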
+ + diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md index 994ff48c7a8a52..5c794b50fbdb12 100644 --- a/tensorflow/g3doc/api_docs/python/math_ops.md +++ b/tensorflow/g3doc/api_docs/python/math_ops.md @@ -1669,50 +1669,6 @@ c = tf.matmul(a, b) => [[[ 94 100] are both set to True. -- - - - -### `tf.batch_matmul(x, y, adj_x=None, adj_y=None, name=None)` {#batch_matmul} - -Multiplies slices of two tensors in batches. - -Multiplies all slices of `Tensor` `x` and `y` (each slice can be -viewed as an element of a batch), and arranges the individual results -in a single output tensor of the same batch size. Each of the -individual slices can optionally be adjointed (to adjoint a matrix -means to transpose and conjugate it) before multiplication by setting -the `adj_x` or `adj_y` flag to `True`, which are by default `False`. - -The input tensors `x` and `y` are 3-D or higher with shape `[..., r_x, c_x]` -and `[..., r_y, c_y]`. - -The output tensor is 3-D or higher with shape `[..., r_o, c_o]`, where: - - r_o = c_x if adj_x else r_x - c_o = r_y if adj_y else c_y - -It is computed as: - - output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) - -##### Args: - - -* `x`: A `Tensor`. Must be one of the following types: `half`, `float32`, `float64`, `int32`, `complex64`, `complex128`. - 3-D or higher with shape `[..., r_x, c_x]`. -* `y`: A `Tensor`. Must have the same type as `x`. - 3-D or higher with shape `[..., r_y, c_y]`. -* `adj_x`: An optional `bool`. Defaults to `False`. - If `True`, adjoint the slices of `x`. Defaults to `False`. -* `adj_y`: An optional `bool`. Defaults to `False`. - If `True`, adjoint the slices of `y`. Defaults to `False`. -* `name`: A name for the operation (optional). - -##### Returns: - - A `Tensor`. Has the same type as `x`. - 3-D or higher with shape `[..., r_o, c_o]` - - - - - @@ -3357,7 +3313,7 @@ a tensor. ### `tf.argmin(input, axis=None, name=None, dimension=None)` {#argmin} -Returns the index with the smallest value across axiss of a tensor. +Returns the index with the smallest value across axes of a tensor. ##### Args: @@ -3377,7 +3333,7 @@ Returns the index with the smallest value across axiss of a tensor. ### `tf.argmax(input, axis=None, name=None, dimension=None)` {#argmax} -Returns the index with the largest value across axiss of a tensor. +Returns the index with the largest value across axes of a tensor. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md index 5f1e189874bb9f..516aa4e29e8f13 100644 --- a/tensorflow/g3doc/api_docs/python/nn.md +++ b/tensorflow/g3doc/api_docs/python/nn.md @@ -733,6 +733,52 @@ inputs are identical. padding is other than `'VALID'` or `'SAME'`. +- - - + +### `tf.nn.atrous_conv2d_transpose(value, filters, output_shape, rate, padding, name=None)` {#atrous_conv2d_transpose} + +The transpose of `atrous_conv2d`. + +This operation is sometimes called "deconvolution" after [Deconvolutional +Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is +actually the transpose (gradient) of `atrous_conv2d` rather than an actual +deconvolution. + +##### Args: + + +* `value`: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC` + format. Its shape is `[batch, in_height, in_width, in_channels]`. +* `filters`: A 4-D `Tensor` with the same type as `value` and shape + `[filter_height, filter_width, out_channels, in_channels]`. `filters`' + `in_channels` dimension must match that of `value`. 
Atrous convolution is + equivalent to standard convolution with upsampled filters with effective + height `filter_height + (filter_height - 1) * (rate - 1)` and effective + width `filter_width + (filter_width - 1) * (rate - 1)`, produced by + inserting `rate - 1` zeros along consecutive elements across the + `filters`' spatial dimensions. +* `output_shape`: A 1-D `Tensor` of shape representing the output shape of the + deconvolution op. +* `rate`: A positive int32. The stride with which we sample input values across + the `height` and `width` dimensions. Equivalently, the rate by which we + upsample the filter values by inserting zeros across the `height` and + `width` dimensions. In the literature, the same parameter is sometimes + called `input stride` or `dilation`. +* `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. +* `name`: Optional name for the returned tensor. + +##### Returns: + + A `Tensor` with the same type as `value`. + +##### Raises: + + +* `ValueError`: If input/output depth does not match `filters`' shape, or if + padding is other than `'VALID'` or `'SAME'`, or if the `rate` is less + than one, or if the output_shape is not a tensor with 4 elements. + + - - - ### `tf.nn.conv2d_transpose(value, filter, output_shape, strides, padding='SAME', data_format='NHWC', name=None)` {#conv2d_transpose} @@ -2067,7 +2113,7 @@ Computes half the L2 norm of a tensor without the `sqrt`: - - - -### `tf.nn.log_poisson_loss(log_input, targets, compute_full_loss=False, name=None)` {#log_poisson_loss} +### `tf.nn.log_poisson_loss(targets, log_input, compute_full_loss=False, name=None)` {#log_poisson_loss} Computes log Poisson loss given `log_input`. @@ -2095,8 +2141,8 @@ loss is ##### Args: -* `log_input`: A `Tensor` of type `float32` or `float64`. * `targets`: A `Tensor` of the same type and shape as `log_input`. +* `log_input`: A `Tensor` of type `float32` or `float64`. * `compute_full_loss`: whether to compute the full loss. If false, a constant term is dropped in favor of more efficient optimization. * `name`: A name for the operation (optional). @@ -2518,7 +2564,7 @@ is the sum of the size of params along dimension 0. TensorFlow provides a number of methods for constructing Recurrent Neural Networks. Most accept an `RNNCell`-subclassed object -(see the documentation for `tf.nn.rnn_cell`). +(see the documentation for `tf.contrib.rnn`). - - - @@ -2922,7 +2968,7 @@ sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32) inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) inputs_ta = inputs_ta.unpack(inputs) -cell = tf.nn.rnn_cell.LSTMCell(num_units) +cell = tf.contrib.rnn.LSTMCell(num_units) def loop_fn(time, cell_output, cell_state, loop_state): emit_output = cell_output # == None for time == 0 @@ -3045,7 +3091,7 @@ outputs = outputs_ta.pack() - - - -### `tf.nn.ctc_loss(inputs, labels, sequence_length, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True)` {#ctc_loss} +### `tf.nn.ctc_loss(labels, inputs, sequence_length, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True)` {#ctc_loss} Computes the CTC (Connectionist Temporal Classification) Loss. @@ -3115,17 +3161,17 @@ Here is a table of the (roughly) expected first order behavior: ##### Args: +* `labels`: An `int32` `SparseTensor`. + `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores + the id for (batch b, time t). + `labels.values[i]` must take on values in `[0, num_labels)`. + See `core/ops/ctc_ops.cc` for more details. 
* `inputs`: 3-D `float` `Tensor`. If time_major == False, this will be a `Tensor` shaped: `[batch_size x max_time x num_classes]`. If time_major == True (default), this will be a `Tensor` shaped: `[max_time x batch_size x num_classes]`. The logits. -* `labels`: An `int32` `SparseTensor`. - `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores - the id for (batch b, time t). - `labels.values[i]` must take on values in `[0, num_labels)`. - See `core/ops/ctc_ops.cc` for more details. * `sequence_length`: 1-D `int32` vector, size `[batch_size]`. The sequence lengths. * `preprocess_collapse_repeated`: Boolean. Default: False. @@ -3338,8 +3384,10 @@ TensorFlow provides the following sampled loss functions for faster training. Computes and returns the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for -unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). -Also see our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf) +unnormalized statistical +models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +Also see our [Candidate Sampling Algorithms +Reference](../../extras/candidate_sampling.pdf) Note: By default this uses a log-uniform (Zipfian) distribution for sampling, so your labels must be sorted in order of decreasing frequency to achieve @@ -3884,3 +3932,32 @@ Produces the average pool of the input tensor for quantized types. * `max_output`: A `Tensor` of type `float32`. The float value that the highest quantized output value represents. + +## Other Functions and Classes +- - - + +### `tf.nn.zero_fraction(value, name=None)` {#zero_fraction} + +Returns the fraction of zeros in `value`. + +If `value` is empty, the result is `nan`. + +This is useful in summaries to measure and report sparsity. For example, + +```python + z = tf.Relu(...) + summ = tf.contrib.deprecated.scalar_summary('sparsity', + tf.nn.zero_fraction(z)) +``` + +##### Args: + + +* `value`: A tensor of numeric type. +* `name`: A name for the operation (optional). + +##### Returns: + + The fraction of zeros in `value`, with type `float32`. + + diff --git a/tensorflow/g3doc/api_docs/python/rnn_cell.md b/tensorflow/g3doc/api_docs/python/rnn_cell.md deleted file mode 100644 index c6d39bd9361866..00000000000000 --- a/tensorflow/g3doc/api_docs/python/rnn_cell.md +++ /dev/null @@ -1,851 +0,0 @@ - - -# Neural Network RNN Cells -[TOC] - -Module for constructing RNN Cells. - -## Base interface for all RNN Cells - -- - - - -### `class tf.nn.rnn_cell.RNNCell` {#RNNCell} - -Abstract object representing an RNN cell. - -The definition of cell in this package differs from the definition used in the -literature. In the literature, cell refers to an object with a single scalar -output. The definition in this package refers to a horizontal array of such -units. - -An RNN cell, in the most abstract setting, is anything that has -a state and performs some operation that takes a matrix of inputs. -This operation results in an output matrix with `self.output_size` columns. -If `self.state_size` is an integer, this operation also results in a new -state matrix with `self.state_size` columns. If `self.state_size` is a -tuple of integers, then it results in a tuple of `len(state_size)` state -matrices, each with a column size corresponding to values in `state_size`. 
- -This module provides a number of basic commonly used RNN cells, such as -LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number -of operators that allow add dropouts, projections, or embeddings for inputs. -Constructing multi-layer cells is supported by the class `MultiRNNCell`, -or by calling the `rnn` ops several times. Every `RNNCell` must have the -properties below and and implement `__call__` with the following signature. -- - - - -#### `tf.nn.rnn_cell.RNNCell.__call__(inputs, state, scope=None)` {#RNNCell.__call__} - -Run this RNN cell on inputs, starting from the given state. - -##### Args: - - -* `inputs`: `2-D` tensor with shape `[batch_size x input_size]`. -* `state`: if `self.state_size` is an integer, this should be a `2-D Tensor` - with shape `[batch_size x self.state_size]`. Otherwise, if - `self.state_size` is a tuple of integers, this should be a tuple - with shapes `[batch_size x s] for s in self.state_size`. -* `scope`: VariableScope for the created subgraph; defaults to class name. - -##### Returns: - - A pair containing: - - - Output: A `2-D` tensor with shape `[batch_size x self.output_size]`. - - New state: Either a single `2-D` tensor, or a tuple of tensors matching - the arity and shapes of `state`. - - -- - - - -#### `tf.nn.rnn_cell.RNNCell.output_size` {#RNNCell.output_size} - -Integer or TensorShape: size of outputs produced by this cell. - - -- - - - -#### `tf.nn.rnn_cell.RNNCell.state_size` {#RNNCell.state_size} - -size(s) of state(s) used by this cell. - -It can be represented by an Integer, a TensorShape or a tuple of Integers -or TensorShapes. - - -- - - - -#### `tf.nn.rnn_cell.RNNCell.zero_state(batch_size, dtype)` {#RNNCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - - -## RNN Cells for use with TensorFlow's core RNN methods - -- - - - -### `class tf.nn.rnn_cell.BasicRNNCell` {#BasicRNNCell} - -The most basic RNN cell. -- - - - -#### `tf.nn.rnn_cell.BasicRNNCell.__call__(inputs, state, scope=None)` {#BasicRNNCell.__call__} - -Most basic RNN: output = new_state = act(W * input + U * state + B). - - -- - - - -#### `tf.nn.rnn_cell.BasicRNNCell.__init__(num_units, input_size=None, activation=tanh)` {#BasicRNNCell.__init__} - - - - -- - - - -#### `tf.nn.rnn_cell.BasicRNNCell.output_size` {#BasicRNNCell.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.BasicRNNCell.state_size` {#BasicRNNCell.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.BasicRNNCell.zero_state(batch_size, dtype)` {#BasicRNNCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. 
- - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.BasicLSTMCell` {#BasicLSTMCell} - -Basic LSTM recurrent network cell. - -The implementation is based on: http://arxiv.org/abs/1409.2329. - -We add forget_bias (default: 1) to the biases of the forget gate in order to -reduce the scale of forgetting in the beginning of the training. - -It does not allow cell clipping, a projection layer, and does not -use peep-hole connections: it is the basic baseline. - -For advanced models, please use the full LSTMCell that follows. -- - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.__call__(inputs, state, scope=None)` {#BasicLSTMCell.__call__} - -Long short-term memory cell (LSTM). - - -- - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.__init__(num_units, forget_bias=1.0, input_size=None, state_is_tuple=True, activation=tanh)` {#BasicLSTMCell.__init__} - -Initialize the basic LSTM cell. - -##### Args: - - -* `num_units`: int, The number of units in the LSTM cell. -* `forget_bias`: float, The bias added to forget gates (see above). -* `input_size`: Deprecated and unused. -* `state_is_tuple`: If True, accepted and returned states are 2-tuples of - the `c_state` and `m_state`. If False, they are concatenated - along the column axis. The latter behavior will soon be deprecated. -* `activation`: Activation function of the inner states. - - -- - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.output_size` {#BasicLSTMCell.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.state_size` {#BasicLSTMCell.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.BasicLSTMCell.zero_state(batch_size, dtype)` {#BasicLSTMCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.GRUCell` {#GRUCell} - -Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). -- - - - -#### `tf.nn.rnn_cell.GRUCell.__call__(inputs, state, scope=None)` {#GRUCell.__call__} - -Gated recurrent unit (GRU) with nunits cells. - - -- - - - -#### `tf.nn.rnn_cell.GRUCell.__init__(num_units, input_size=None, activation=tanh)` {#GRUCell.__init__} - - - - -- - - - -#### `tf.nn.rnn_cell.GRUCell.output_size` {#GRUCell.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.GRUCell.state_size` {#GRUCell.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.GRUCell.zero_state(batch_size, dtype)` {#GRUCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. 
- - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.LSTMCell` {#LSTMCell} - -Long short-term memory unit (LSTM) recurrent network cell. - -The default non-peephole implementation is based on: - - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf - -S. Hochreiter and J. Schmidhuber. -"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. - -The peephole implementation is based on: - - https://research.google.com/pubs/archive/43905.pdf - -Hasim Sak, Andrew Senior, and Francoise Beaufays. -"Long short-term memory recurrent neural network architectures for - large scale acoustic modeling." INTERSPEECH, 2014. - -The class uses optional peep-hole connections, optional cell clipping, and -an optional projection layer. -- - - - -#### `tf.nn.rnn_cell.LSTMCell.__call__(inputs, state, scope=None)` {#LSTMCell.__call__} - -Run one step of LSTM. - -##### Args: - - -* `inputs`: input Tensor, 2D, batch x num_units. -* `state`: if `state_is_tuple` is False, this must be a state Tensor, - `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a - tuple of state Tensors, both `2-D`, with column sizes `c_state` and - `m_state`. -* `scope`: VariableScope for the created subgraph; defaults to "lstm_cell". - -##### Returns: - - A tuple containing: - - - A `2-D, [batch x output_dim]`, Tensor representing the output of the - LSTM after reading `inputs` when previous state was `state`. - Here output_dim is: - num_proj if num_proj was set, - num_units otherwise. - - Tensor(s) representing the new state of LSTM after reading `inputs` when - the previous state was `state`. Same type and shape(s) as `state`. - -##### Raises: - - -* `ValueError`: If input size cannot be inferred from inputs via - static shape inference. - - -- - - - -#### `tf.nn.rnn_cell.LSTMCell.__init__(num_units, input_size=None, use_peepholes=False, cell_clip=None, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=None, num_proj_shards=None, forget_bias=1.0, state_is_tuple=True, activation=tanh)` {#LSTMCell.__init__} - -Initialize the parameters for an LSTM cell. - -##### Args: - - -* `num_units`: int, The number of units in the LSTM cell -* `input_size`: Deprecated and unused. -* `use_peepholes`: bool, set True to enable diagonal/peephole connections. -* `cell_clip`: (optional) A float value, if provided the cell state is clipped - by this value prior to the cell output activation. -* `initializer`: (optional) The initializer to use for the weight and - projection matrices. -* `num_proj`: (optional) int, The output dimensionality for the projection - matrices. If None, no projection is performed. -* `proj_clip`: (optional) A float value. If `num_proj > 0` and `proj_clip` is - provided, then the projected values are clipped elementwise to within - `[-proj_clip, proj_clip]`. -* `num_unit_shards`: Deprecated, will be removed by Jan. 2017. - Use a variable_scope partitioner instead. -* `num_proj_shards`: Deprecated, will be removed by Jan. 2017. - Use a variable_scope partitioner instead. -* `forget_bias`: Biases of the forget gate are initialized by default to 1 - in order to reduce the scale of forgetting at the beginning of - the training. -* `state_is_tuple`: If True, accepted and returned states are 2-tuples of - the `c_state` and `m_state`. If False, they are concatenated - along the column axis. 
This latter behavior will soon be deprecated. -* `activation`: Activation function of the inner states. - - -- - - - -#### `tf.nn.rnn_cell.LSTMCell.output_size` {#LSTMCell.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.LSTMCell.state_size` {#LSTMCell.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.LSTMCell.zero_state(batch_size, dtype)` {#LSTMCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - - -## Classes storing split `RNNCell` state - -- - - - -### `class tf.nn.rnn_cell.LSTMStateTuple` {#LSTMStateTuple} - -Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state. - -Stores two elements: `(c, h)`, in that order. - -Only used when `state_is_tuple=True`. -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__getnewargs__()` {#LSTMStateTuple.__getnewargs__} - -Return self as a plain tuple. Used by copy and pickle. - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__getstate__()` {#LSTMStateTuple.__getstate__} - -Exclude the OrderedDict from pickling - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__new__(_cls, c, h)` {#LSTMStateTuple.__new__} - -Create new instance of LSTMStateTuple(c, h) - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.__repr__()` {#LSTMStateTuple.__repr__} - -Return a nicely formatted representation string - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.c` {#LSTMStateTuple.c} - -Alias for field number 0 - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.dtype` {#LSTMStateTuple.dtype} - - - - -- - - - -#### `tf.nn.rnn_cell.LSTMStateTuple.h` {#LSTMStateTuple.h} - -Alias for field number 1 - - - - -## RNN Cell wrappers (RNNCells that wrap other RNNCells) - -- - - - -### `class tf.nn.rnn_cell.MultiRNNCell` {#MultiRNNCell} - -RNN cell composed sequentially of multiple simple cells. -- - - - -#### `tf.nn.rnn_cell.MultiRNNCell.__call__(inputs, state, scope=None)` {#MultiRNNCell.__call__} - -Run this multi-layer cell on inputs, starting from state. - - -- - - - -#### `tf.nn.rnn_cell.MultiRNNCell.__init__(cells, state_is_tuple=True)` {#MultiRNNCell.__init__} - -Create a RNN cell composed sequentially of a number of RNNCells. - -##### Args: - - -* `cells`: list of RNNCells that will be composed in this order. -* `state_is_tuple`: If True, accepted and returned states are n-tuples, where - `n = len(cells)`. If False, the states are all - concatenated along the column axis. This latter behavior will soon be - deprecated. - -##### Raises: - - -* `ValueError`: if cells is empty (not allowed), or at least one of the cells - returns a state tuple but the flag `state_is_tuple` is `False`. - - -- - - - -#### `tf.nn.rnn_cell.MultiRNNCell.output_size` {#MultiRNNCell.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.MultiRNNCell.state_size` {#MultiRNNCell.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.MultiRNNCell.zero_state(batch_size, dtype)` {#MultiRNNCell.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. 
- -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.DropoutWrapper` {#DropoutWrapper} - -Operator adding dropout to inputs and outputs of the given cell. -- - - - -#### `tf.nn.rnn_cell.DropoutWrapper.__call__(inputs, state, scope=None)` {#DropoutWrapper.__call__} - -Run the cell with the declared dropouts. - - -- - - - -#### `tf.nn.rnn_cell.DropoutWrapper.__init__(cell, input_keep_prob=1.0, output_keep_prob=1.0, seed=None)` {#DropoutWrapper.__init__} - -Create a cell with added input and/or output dropout. - -Dropout is never used on the state. - -##### Args: - - -* `cell`: an RNNCell, a projection to output_size is added to it. -* `input_keep_prob`: unit Tensor or float between 0 and 1, input keep - probability; if it is float and 1, no input dropout will be added. -* `output_keep_prob`: unit Tensor or float between 0 and 1, output keep - probability; if it is float and 1, no output dropout will be added. -* `seed`: (optional) integer, the randomness seed. - -##### Raises: - - -* `TypeError`: if cell is not an RNNCell. -* `ValueError`: if keep_prob is not between 0 and 1. - - -- - - - -#### `tf.nn.rnn_cell.DropoutWrapper.output_size` {#DropoutWrapper.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.DropoutWrapper.state_size` {#DropoutWrapper.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.DropoutWrapper.zero_state(batch_size, dtype)` {#DropoutWrapper.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.EmbeddingWrapper` {#EmbeddingWrapper} - -Operator adding input embedding to the given cell. - -Note: in many cases it may be more efficient to not use this wrapper, -but instead concatenate the whole sequence of your inputs in time, -do the embedding on this batch-concatenated sequence, then split it and -feed into your RNN. -- - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.__call__(inputs, state, scope=None)` {#EmbeddingWrapper.__call__} - -Run the cell on embedded inputs. - - -- - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.__init__(cell, embedding_classes, embedding_size, initializer=None)` {#EmbeddingWrapper.__init__} - -Create a cell with an added input embedding. - -##### Args: - - -* `cell`: an RNNCell, an embedding will be put before its inputs. -* `embedding_classes`: integer, how many symbols will be embedded. -* `embedding_size`: integer, the size of the vectors we embed into. -* `initializer`: an initializer to use when creating the embedding; - if None, the initializer from variable scope or a default one is used. - -##### Raises: - - -* `TypeError`: if cell is not an RNNCell. -* `ValueError`: if embedding_classes is not positive. 
- - -- - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.output_size` {#EmbeddingWrapper.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.state_size` {#EmbeddingWrapper.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.EmbeddingWrapper.zero_state(batch_size, dtype)` {#EmbeddingWrapper.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.InputProjectionWrapper` {#InputProjectionWrapper} - -Operator adding an input projection to the given cell. - -Note: in many cases it may be more efficient to not use this wrapper, -but instead concatenate the whole sequence of your inputs in time, -do the projection on this batch-concatenated sequence, then split it. -- - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.__call__(inputs, state, scope=None)` {#InputProjectionWrapper.__call__} - -Run the input projection and then the cell. - - -- - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.__init__(cell, num_proj, input_size=None)` {#InputProjectionWrapper.__init__} - -Create a cell with input projection. - -##### Args: - - -* `cell`: an RNNCell, a projection of inputs is added before it. -* `num_proj`: Python integer. The dimension to project to. -* `input_size`: Deprecated and unused. - -##### Raises: - - -* `TypeError`: if cell is not an RNNCell. - - -- - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.output_size` {#InputProjectionWrapper.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.state_size` {#InputProjectionWrapper.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.InputProjectionWrapper.zero_state(batch_size, dtype)` {#InputProjectionWrapper.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - -- - - - -### `class tf.nn.rnn_cell.OutputProjectionWrapper` {#OutputProjectionWrapper} - -Operator adding an output projection to the given cell. - -Note: in many cases it may be more efficient to not use this wrapper, -but instead concatenate the whole sequence of your outputs in time, -do the projection on this batch-concatenated sequence, then split it -if needed or directly feed into a softmax. -- - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.__call__(inputs, state, scope=None)` {#OutputProjectionWrapper.__call__} - -Run the cell and output projection on inputs, starting from state. - - -- - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.__init__(cell, output_size)` {#OutputProjectionWrapper.__init__} - -Create a cell with output projection. 
- -##### Args: - - -* `cell`: an RNNCell, a projection to output_size is added to it. -* `output_size`: integer, the size of the output after projection. - -##### Raises: - - -* `TypeError`: if cell is not an RNNCell. -* `ValueError`: if output_size is not positive. - - -- - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.output_size` {#OutputProjectionWrapper.output_size} - - - - -- - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.state_size` {#OutputProjectionWrapper.state_size} - - - - -- - - - -#### `tf.nn.rnn_cell.OutputProjectionWrapper.zero_state(batch_size, dtype)` {#OutputProjectionWrapper.zero_state} - -Return zero-filled state tensor(s). - -##### Args: - - -* `batch_size`: int, float, or unit Tensor representing the batch size. -* `dtype`: the data type to use for the state. - -##### Returns: - - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. - - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with -the shapes `[batch_size x s]` for each s in `state_size`. - - - diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md index 2590b21fcc143e..f789a9878bf7f3 100644 --- a/tensorflow/g3doc/api_docs/python/sparse_ops.md +++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md @@ -21,37 +21,41 @@ dimension, and dense along all other dimensions. Represents a sparse tensor. TensorFlow represents a sparse tensor as three separate dense tensors: -`indices`, `values`, and `shape`. In Python, the three tensors are +`indices`, `values`, and `dense_shape`. In Python, the three tensors are collected into a `SparseTensor` class for ease of use. If you have separate -`indices`, `values`, and `shape` tensors, wrap them in a `SparseTensor` +`indices`, `values`, and `dense_shape` tensors, wrap them in a `SparseTensor` object before passing to the ops below. -Concretely, the sparse tensor `SparseTensor(indices, values, shape)` +Concretely, the sparse tensor `SparseTensor(indices, values, dense_shape)` comprises the following components, where `N` and `ndims` are the number of values and number of dimensions in the `SparseTensor`, respectively: -* `indices`: A 2-D int64 tensor of shape `[N, ndims]`, which specifies +* `indices`: A 2-D int64 tensor of dense_shape `[N, ndims]`, which specifies the indices of the elements in the sparse tensor that contain nonzero values (elements are zero-indexed). For example, `indices=[[1,3], [2,4]]` specifies that the elements with indexes of [1,3] and [2,4] have nonzero values. -* `values`: A 1-D tensor of any type and shape `[N]`, which supplies the +* `values`: A 1-D tensor of any type and dense_shape `[N]`, which supplies the values for each element in `indices`. For example, given `indices=[[1,3], [2,4]]`, the parameter `values=[18, 3.6]` specifies that element [1,3] of the sparse tensor has a value of 18, and element [2,4] of the tensor has a value of 3.6. -* `shape`: A 1-D int64 tensor of shape `[ndims]`, which specifies the shape - of the sparse tensor. Takes a list indicating the number of elements in - each dimension. For example, `shape=[3,6]` specifies a two-dimensional 3x6 - tensor, `shape=[2,3,4]` specifies a three-dimensional 2x3x4 tensor, and - `shape=[9]` specifies a one-dimensional tensor with 9 elements. +* `dense_shape`: A 1-D int64 tensor of dense_shape `[ndims]`, which specifies +the + dense_shape of the sparse tensor. 
Takes a list indicating the number of + elements + in each dimension. For example, `dense_shape=[3,6]` specifies a + two-dimensional + 3x6 tensor, `dense_shape=[2,3,4]` specifies a three-dimensional 2x3x4 + tensor, and + `dense_shape=[9]` specifies a one-dimensional tensor with 9 elements. The corresponding dense tensor satisfies: ```python -dense.shape = shape +dense.shape = dense_shape dense[tuple(indices[i])] = values[i] ``` @@ -64,7 +68,7 @@ obtained by calling `tf.sparse_reorder(st)`. Example: The sparse tensor ```python -SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) +SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) ``` represents the dense tensor @@ -77,27 +81,34 @@ represents the dense tensor - - - -#### `tf.SparseTensor.__init__(indices, values, shape)` {#SparseTensor.__init__} +#### `tf.SparseTensor.__init__(indices, values, dense_shape=None, shape=None)` {#SparseTensor.__init__} Creates a `SparseTensor`. ##### Args: -* `indices`: A 2-D int64 tensor of shape `[N, ndims]`. -* `values`: A 1-D tensor of any type and shape `[N]`. -* `shape`: A 1-D int64 tensor of shape `[ndims]`. +* `indices`: A 2-D int64 tensor of dense_shape `[N, ndims]`. +* `values`: A 1-D tensor of any type and dense_shape `[N]`. +* `dense_shape`: A 1-D int64 tensor of dense_shape `[ndims]`. +* `shape`: Temporary. Legacy naming of dense_shape. Only one of `shape` or + `dense_shape` must be provided. ##### Returns: - A `SparseTensor` + A `SparseTensor`. + +##### Raises: + + +* `ValueError`: if both `shape` and `dense_shape` are provided. - - - #### `tf.SparseTensor.get_shape()` {#SparseTensor.get_shape} -Get the `TensorShape` that represents the shape of the dense tensor. +Get the `TensorShape` representing the shape of the dense tensor. ##### Returns: @@ -112,7 +123,7 @@ The indices of non-zero values in the represented dense tensor. ##### Returns: - A 2-D Tensor of int64 with shape `[N, ndims]`, where `N` is the + A 2-D Tensor of int64 with dense_shape `[N, ndims]`, where `N` is the number of non-zero values in the tensor, and `ndims` is the rank. @@ -129,7 +140,7 @@ The non-zero values in the represented dense tensor. - - - -#### `tf.SparseTensor.shape` {#SparseTensor.shape} +#### `tf.SparseTensor.dense_shape` {#SparseTensor.dense_shape} A 1-D Tensor of int64 representing the shape of the dense tensor. @@ -152,7 +163,7 @@ The `Operation` that produces `values` as an output. #### `tf.SparseTensor.graph` {#SparseTensor.graph} -The `Graph` that contains the index, value, and shape tensors. +The `Graph` that contains the index, value, and dense_shape tensors. @@ -268,59 +279,75 @@ available, or `session` must be specified explicitly. +- - - + +#### `tf.SparseTensor.shape` {#SparseTensor.shape} + +Legacy property returning `dense_shape`. + + - - - ### `class tf.SparseTensorValue` {#SparseTensorValue} -SparseTensorValue(indices, values, shape) +Stores the calculated numpy arrays representing a `SparseTensor`. + +Returned as the output of a session.run on a `SparseTensor` object. - - - -#### `tf.SparseTensorValue.__getnewargs__()` {#SparseTensorValue.__getnewargs__} +#### `tf.SparseTensorValue.__getitem__(i)` {#SparseTensorValue.__getitem__} + -Return self as a plain tuple. Used by copy and pickle. 
- - - -#### `tf.SparseTensorValue.__getstate__()` {#SparseTensorValue.__getstate__} +#### `tf.SparseTensorValue.__init__(indices, values, dense_shape=None, shape=None)` {#SparseTensorValue.__init__} + -Exclude the OrderedDict from pickling - - - -#### `tf.SparseTensorValue.__new__(_cls, indices, values, shape)` {#SparseTensorValue.__new__} +#### `tf.SparseTensorValue.__iter__()` {#SparseTensorValue.__iter__} + -Create new instance of SparseTensorValue(indices, values, shape) - - - #### `tf.SparseTensorValue.__repr__()` {#SparseTensorValue.__repr__} -Return a nicely formatted representation string + + + +- - - + +#### `tf.SparseTensorValue.dense_shape` {#SparseTensorValue.dense_shape} + + - - - #### `tf.SparseTensorValue.indices` {#SparseTensorValue.indices} -Alias for field number 0 + - - - #### `tf.SparseTensorValue.shape` {#SparseTensorValue.shape} -Alias for field number 2 + - - - #### `tf.SparseTensorValue.values` {#SparseTensorValue.values} -Alias for field number 1 + @@ -411,7 +438,7 @@ tested if validate_indices is True. ##### Returns: - A dense tensor with shape `sp_input.shape` and values specified by + A dense tensor with shape `sp_input.dense_shape` and values specified by the non-empty values in `sp_input`. Indices not in `sp_input` are assigned `default_value`. @@ -428,14 +455,14 @@ tested if validate_indices is True. Converts a `SparseTensor` of ids into a dense bool indicator tensor. The last dimension of `sp_input.indices` is discarded and replaced with -the values of `sp_input`. If `sp_input.shape = [D0, D1, ..., Dn, K]`, then -`output.shape = [D0, D1, ..., Dn, vocab_size]`, where +the values of `sp_input`. If `sp_input.dense_shape = [D0, D1, ..., Dn, K]`, +then `output.shape = [D0, D1, ..., Dn, vocab_size]`, where output[d_0, d_1, ..., d_n, sp_input[d_0, d_1, ..., d_n, k]] = True and False elsewhere in `output`. -For example, if `sp_input.shape = [2, 3, 4]` with non-empty values: +For example, if `sp_input.dense_shape = [2, 3, 4]` with non-empty values: [0, 0, 0]: 0 [0, 1, 0]: 10 @@ -493,7 +520,7 @@ The `SparseTensor` returned by this function has the following properties: - `indices` is equivalent to `sp_ids.indices` with the last dimension discarded and replaced with `sp_ids.values`. - `values` is simply `sp_values.values`. - - If `sp_ids.shape = [D0, D1, ..., Dn, K]`, then + - If `sp_ids.dense_shape = [D0, D1, ..., Dn, K]`, then `output.shape = [D0, D1, ..., Dn, vocab_size]`. For example, consider the following feature vectors: @@ -534,7 +561,7 @@ equal to: ```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], - shape=[3, 6]) + dense_shape=[3, 6]) ``` ##### Args: @@ -566,7 +593,7 @@ equal to: - - - -### `tf.sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False)` {#sparse_concat} +### `tf.sparse_concat(axis, sp_inputs, name=None, expand_nonconcat_dim=False, concat_dim=None)` {#sparse_concat} Concatenates a list of `SparseTensor` along the specified dimension. @@ -595,7 +622,7 @@ This op runs in `O(M log M)` time, where `M` is the total number of non-empty values across all inputs. This is due to the need for an internal sort in order to concatenate efficiently across an arbitrary dimension. 
-For example, if `concat_dim = 1` and the inputs are +For example, if `axis = 1` and the inputs are sp_inputs[0]: shape = [2, 3] [0, 2]: "a" @@ -620,7 +647,7 @@ Graphically this is equivalent to doing [ a] concat [ d e ] = [ a d e ] [b c ] [ ] [b c ] -Another example, if 'concat_dim = 1' and the inputs are +Another example, if 'axis = 1' and the inputs are sp_inputs[0]: shape = [3, 3] [0, 2]: "a" @@ -651,12 +678,13 @@ Graphically this is equivalent to doing ##### Args: -* `concat_dim`: Dimension to concatenate along. Must be in range [-rank, rank), +* `axis`: Dimension to concatenate along. Must be in range [-rank, rank), where rank is the number of dimensions in each input `SparseTensor`. * `sp_inputs`: List of `SparseTensor` to concatenate. * `name`: A name prefix for the returned tensors (optional). * `expand_nonconcat_dim`: Whether to allow the expansion in the non-concat dimensions. Defaulted to False. +* `concat_dim`: The old (deprecated) name for axis. ##### Returns: @@ -768,13 +796,13 @@ shape `[9, 4]` and `indices` / `values`: - - - -### `tf.sparse_split(split_dim, num_split, sp_input, name=None)` {#sparse_split} +### `tf.sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, axis=None, name=None, split_dim=None)` {#sparse_split} -Split a `SparseTensor` into `num_split` tensors along `split_dim`. +Split a `SparseTensor` into `num_split` tensors along `axis`. -If the `sp_input.shape[split_dim]` is not an integer multiple of `num_split` -each slice starting from 0:`shape[split_dim] % num_split` gets extra one -dimension. For example, if `split_dim = 1` and `num_split = 2` and the +If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split` +each slice starting from 0:`shape[axis] % num_split` gets extra one +dimension. For example, if `axis = 1` and `num_split = 2` and the input is: input_tensor = shape = [2, 7] @@ -794,10 +822,12 @@ Graphically the output tensors are: ##### Args: -* `split_dim`: A 0-D `int32` `Tensor`. The dimension along which to split. -* `num_split`: A Python integer. The number of ways to split. +* `keyword_required`: Python 2 standin for * (temporary for argument reorder) * `sp_input`: The `SparseTensor` to split. +* `num_split`: A Python integer. The number of ways to split. +* `axis`: A 0-D `int32` `Tensor`. The dimension along which to split. * `name`: A name for the operation (optional). +* `split_dim`: Deprecated old name for axis. ##### Returns: @@ -807,6 +837,7 @@ Graphically the output tensors are: * `TypeError`: If `sp_input` is not a `SparseTensor`. +* `ValueError`: If the deprecated `split_dim` and `axis` are both non None. - - - @@ -1233,8 +1264,8 @@ converting the `SparseTensor` to a dense one and using `tf.matmul` with `sp_a=True`. This operation tends to perform well when A is more sparse, if the column size -of the product is small (e.g. matrix-vector multiplication), if sp_a.shape -takes on large values. +of the product is small (e.g. matrix-vector multiplication), if +`sp_a.dense_shape` takes on large values. Below is a rough speed comparison between sparse_tensor_dense_matmul, labelled 'sparse', and matmul(sp_a=True), labelled 'dense'. For purposes of diff --git a/tensorflow/g3doc/api_docs/python/string_ops.md b/tensorflow/g3doc/api_docs/python/string_ops.md index 7e751488914468..fc27ba36f00953 100644 --- a/tensorflow/g3doc/api_docs/python/string_ops.md +++ b/tensorflow/g3doc/api_docs/python/string_ops.md @@ -105,7 +105,7 @@ string tensor. 
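As a quick check of the new `axis` calling convention for `tf.sparse_concat` described above (the deprecated `concat_dim` keyword still works), a minimal sketch reproducing the first concatenation example:

```python
import tensorflow as tf

a = tf.SparseTensor(indices=[[0, 2], [1, 0], [1, 1]],
                    values=["a", "b", "c"],
                    dense_shape=[2, 3])
b = tf.SparseTensor(indices=[[0, 1], [0, 2]],
                    values=["d", "e"],
                    dense_shape=[2, 4])

# Concatenate along the columns; result has dense_shape [2, 7].
concatenated = tf.sparse_concat(axis=1, sp_inputs=[a, b])

with tf.Session() as sess:
    print(sess.run(concatenated))
```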
- - - -### `tf.reduce_join(inputs, reduction_indices, keep_dims=None, separator=None, name=None)` {#reduce_join} +### `tf.reduce_join(inputs, axis=None, keep_dims=False, separator='', name=None, reduction_indices=None)` {#reduce_join} Joins a string Tensor across the given dimensions. @@ -113,7 +113,7 @@ Computes the string join across dimensions in the given string Tensor of shape `[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input strings with the given separator (default: empty string). Negative indices are counted backwards from the end, with `-1` being equivalent to `n - 1`. Passing -an empty `reduction_indices` joins all strings in linear index order and outputs +an empty `axis` joins all strings in linear index order and outputs a scalar string. @@ -138,9 +138,9 @@ tf.reduce_join(a, []) ==> ["abcd"] * `inputs`: A `Tensor` of type `string`. The input to be joined. All reduced indices must have non-zero size. -* `reduction_indices`: A `Tensor` of type `int32`. +* `axis`: A `Tensor` of type `int32`. The dimensions to reduce over. Dimensions are reduced in the - order specified. Omitting `reduction_indices` is equivalent to passing + order specified. Omitting `axis` is equivalent to passing `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. * `keep_dims`: An optional `bool`. Defaults to `False`. If `True`, retain reduced dimensions with length `1`. diff --git a/tensorflow/g3doc/api_docs/python/tensor_array_ops.md b/tensorflow/g3doc/api_docs/python/tensor_array_ops.md index a994e90b8597fc..c5019f41bf91da 100644 --- a/tensorflow/g3doc/api_docs/python/tensor_array_ops.md +++ b/tensorflow/g3doc/api_docs/python/tensor_array_ops.md @@ -76,11 +76,12 @@ must all match. - - - -#### `tf.TensorArray.pack(name=None)` {#TensorArray.pack} +#### `tf.TensorArray.pack(*args, **kwargs)` {#TensorArray.pack} -Return the values in the TensorArray as a packed `Tensor`. +Return the values in the TensorArray as a stacked `Tensor`. All of the values must have been written and their shapes must all match. +If input shapes have rank-`R`, then output shape will have rank-`(R+1)`. ##### Args: @@ -89,7 +90,7 @@ All of the values must have been written and their shapes must all match. ##### Returns: - All the tensors in the TensorArray packed into one tensor. + All the tensors in the TensorArray stacked into one tensor. - - - @@ -163,19 +164,22 @@ Scatter the values of a `Tensor` in specific indices of a `TensorArray`. - - - -#### `tf.TensorArray.unpack(value, name=None)` {#TensorArray.unpack} +#### `tf.TensorArray.unpack(*args, **kwargs)` {#TensorArray.unpack} -Pack the values of a `Tensor` in the TensorArray. +Unstack the values of a `Tensor` in the TensorArray. + +If input value shapes have rank-`R`, then the output TensorArray will +contain elements whose shapes are rank-`(R-1)`. ##### Args: -* `value`: (N+1)-D. Tensor of type `dtype`. The Tensor to unpack. +* `value`: (N+1)-D. Tensor of type `dtype`. The Tensor to unstack. * `name`: A name for the operation (optional). ##### Returns: - A new TensorArray object with flow that ensures the unpack occurs. + A new TensorArray object with flow that ensures the unstack occurs. Use this object all for subsequent operations. ##### Raises: @@ -221,7 +225,7 @@ Split the values of a `Tensor` into the TensorArray. 
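The `stack`/`unstack` pair (and the deprecated `pack`/`unpack` aliases) can be exercised with a minimal sketch like the following; `clear_after_read=False` is set only so an element can be read before the final `stack()`:

```python
import tensorflow as tf

value = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])   # rank-2, shape [3, 2]

ta = tf.TensorArray(dtype=tf.float32, size=3, clear_after_read=False)
ta = ta.unstack(value)    # three rank-1 elements, each of shape [2]
element = ta.read(1)      # [3.0, 4.0]
stacked = ta.stack()      # back to rank-2, shape [3, 2]

with tf.Session() as sess:
    print(sess.run([element, stacked]))
```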
#### Other Methods - - - -#### `tf.TensorArray.__init__(dtype, size=None, dynamic_size=None, clear_after_read=None, tensor_array_name=None, handle=None, flow=None, infer_shape=True, elem_shape=None, name=None)` {#TensorArray.__init__} +#### `tf.TensorArray.__init__(dtype, size=None, dynamic_size=None, clear_after_read=None, tensor_array_name=None, handle=None, flow=None, infer_shape=True, element_shape=None, name=None)` {#TensorArray.__init__} Construct a new TensorArray or wrap an existing TensorArray handle. @@ -252,8 +256,9 @@ is created within a `while_loop`. `TensorArray.flow`. * `infer_shape`: (optional, default: True) If True, shape inference is enabled. In this case, all elements must have the same shape. -* `elem_shape`: (optional, default: None) A TensorShape object specifying - the shape of all the elements of the TensorArray. +* `element_shape`: (optional, default: None) A `TensorShape` object specifying + the shape constraints of each of the elements of the TensorArray. + Need not be fully defined. * `name`: A name for the operation (optional). ##### Raises: @@ -284,4 +289,49 @@ The data type of this TensorArray. Return the size of the TensorArray. +- - - + +#### `tf.TensorArray.stack(name=None)` {#TensorArray.stack} + +Return the values in the TensorArray as a stacked `Tensor`. + +All of the values must have been written and their shapes must all match. +If input shapes have rank-`R`, then output shape will have rank-`(R+1)`. + +##### Args: + + +* `name`: A name for the operation (optional). + +##### Returns: + + All the tensors in the TensorArray stacked into one tensor. + + +- - - + +#### `tf.TensorArray.unstack(value, name=None)` {#TensorArray.unstack} + +Unstack the values of a `Tensor` in the TensorArray. + +If input value shapes have rank-`R`, then the output TensorArray will +contain elements whose shapes are rank-`(R-1)`. + +##### Args: + + +* `value`: (N+1)-D. Tensor of type `dtype`. The Tensor to unstack. +* `name`: A name for the operation (optional). + +##### Returns: + + A new TensorArray object with flow that ensures the unstack occurs. + Use this object all for subsequent operations. + +##### Raises: + + +* `ValueError`: if the shape inference fails. + + diff --git a/tensorflow/g3doc/api_docs/python/test.md b/tensorflow/g3doc/api_docs/python/test.md index f34a7465891add..20cc3a55b728d6 100644 --- a/tensorflow/g3doc/api_docs/python/test.md +++ b/tensorflow/g3doc/api_docs/python/test.md @@ -492,7 +492,7 @@ then compares them using self._AssertProtoEqual(). - - - -#### `tf.test.TestCase.assertProtoEqualsVersion(expected, actual, producer=17, min_consumer=0)` {#TestCase.assertProtoEqualsVersion} +#### `tf.test.TestCase.assertProtoEqualsVersion(expected, actual, producer=18, min_consumer=0)` {#TestCase.assertProtoEqualsVersion} diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md index fb4ff94f4f41ec..e7b0e1a2b5e158 100644 --- a/tensorflow/g3doc/api_docs/python/train.md +++ b/tensorflow/g3doc/api_docs/python/train.md @@ -3880,489 +3880,12 @@ This method is completely compatible with the `tf.Session.run()` method. -## Summary Operations - -The following ops output -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -protocol buffers as serialized string tensors. - -You can fetch the output of a summary op in a session, and pass it to -a [SummaryWriter](../../api_docs/python/train.md#SummaryWriter) to append it -to an event file. 
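A hedged sketch of the workflow described here, written against the non-deprecated `tf.summary` API rather than the ops being removed below: create a summary op, merge, and append the serialized `Summary` protos to an event file with a `FileWriter`. The logdir is hypothetical.

```python
import tensorflow as tf

loss = tf.placeholder(tf.float32, name="loss")
tf.summary.scalar("loss", loss)
merged = tf.summary.merge_all()

with tf.Session() as sess:
    writer = tf.summary.FileWriter("/tmp/example_logs", sess.graph)
    for step in range(3):
        summary = sess.run(merged, feed_dict={loss: 1.0 / (step + 1)})
        writer.add_summary(summary, step)
    writer.close()
```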
Event files contain -[`Event`](https://www.tensorflow.org/code/tensorflow/core/util/event.proto) -protos that can contain `Summary` protos along with the timestamp and -step. You can then use TensorBoard to visualize the contents of the -event files. See [TensorBoard and -Summaries](../../how_tos/summaries_and_tensorboard/index.md) for more -details. - -- - - - -### `tf.scalar_summary(tags, values, collections=None, name=None)` {#scalar_summary} - -Outputs a `Summary` protocol buffer with scalar values. - -The input `tags` and `values` must have the same shape. The generated -summary has a summary value for each tag-value pair in `tags` and `values`. - -##### Args: - - -* `tags`: A `string` `Tensor`. Tags for the summaries. -* `values`: A real numeric Tensor. Values for the summaries. -* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - - -- - - - -### `tf.image_summary(tag, tensor, max_images=3, collections=None, name=None)` {#image_summary} - -Outputs a `Summary` protocol buffer with images. - -The summary has up to `max_images` summary values containing images. The -images are built from `tensor` which must be 4-D with shape `[batch_size, -height, width, channels]` and where `channels` can be: - -* 1: `tensor` is interpreted as Grayscale. -* 3: `tensor` is interpreted as RGB. -* 4: `tensor` is interpreted as RGBA. - -The images have the same number of channels as the input tensor. For float -input, the values are normalized one image at a time to fit in the range -`[0, 255]`. `uint8` values are unchanged. The op uses two different -normalization algorithms: - -* If the input values are all positive, they are rescaled so the largest one - is 255. - -* If any input value is negative, the values are shifted so input value 0.0 - is at 127. They are then rescaled so that either the smallest value is 0, - or the largest one is 255. - -The `tag` argument is a scalar `Tensor` of type `string`. It is used to -build the `tag` of the summary values: - -* If `max_images` is 1, the summary value tag is '*tag*/image'. -* If `max_images` is greater than 1, the summary value tags are - generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. - -##### Args: - - -* `tag`: A scalar `Tensor` of type `string`. Used to build the `tag` - of the summary values. -* `tensor`: A 4-D `uint8` or `float32` `Tensor` of shape `[batch_size, height, - width, channels]` where `channels` is 1, 3, or 4. -* `max_images`: Max number of batch elements to generate images for. -* `collections`: Optional list of ops.GraphKeys. The collections to add the - summary to. Defaults to [ops.GraphKeys.SUMMARIES] -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - - -- - - - -### `tf.audio_summary(tag, tensor, sample_rate, max_outputs=3, collections=None, name=None)` {#audio_summary} - -Outputs a `Summary` protocol buffer with audio. - -The summary has up to `max_outputs` summary values containing audio. The -audio is built from `tensor` which must be 3-D with shape `[batch_size, -frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are -assumed to be in the range of `[-1.0, 1.0]` with a sample rate of -`sample_rate`. 
- -The `tag` argument is a scalar `Tensor` of type `string`. It is used to -build the `tag` of the summary values: - -* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. -* If `max_outputs` is greater than 1, the summary value tags are - generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. - -##### Args: - - -* `tag`: A scalar `Tensor` of type `string`. Used to build the `tag` - of the summary values. -* `tensor`: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]` - or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`. -* `sample_rate`: A Scalar `float32` `Tensor` indicating the sample rate of the - signal in hertz. -* `max_outputs`: Max number of batch elements to generate audio for. -* `collections`: Optional list of ops.GraphKeys. The collections to add the - summary to. Defaults to [ops.GraphKeys.SUMMARIES] -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - - -- - - - -### `tf.histogram_summary(tag, values, collections=None, name=None)` {#histogram_summary} - -Outputs a `Summary` protocol buffer with a histogram. - -The generated -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -has one summary value containing a histogram for `values`. - -This op reports an `InvalidArgument` error if any value is not finite. - -##### Args: - - -* `tag`: A `string` `Tensor`. 0-D. Tag to use for the summary value. -* `values`: A real numeric `Tensor`. Any shape. Values to use to - build the histogram. -* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. - - -- - - - -### `tf.nn.zero_fraction(value, name=None)` {#zero_fraction} - -Returns the fraction of zeros in `value`. - -If `value` is empty, the result is `nan`. - -This is useful in summaries to measure and report sparsity. For example, - -```python - z = tf.Relu(...) - summ = tf.contrib.deprecated.scalar_summary('sparsity', tf.nn.zero_fraction(z)) -``` - -##### Args: - - -* `value`: A tensor of numeric type. -* `name`: A name for the operation (optional). - -##### Returns: - - The fraction of zeros in `value`, with type `float32`. - - - -- - - - -### `tf.merge_summary(inputs, collections=None, name=None)` {#merge_summary} - -Merges summaries. - -This op creates a -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -protocol buffer that contains the union of all the values in the input -summaries. - -When the Op is run, it reports an `InvalidArgument` error if multiple values -in the summaries to merge use the same tag. - -##### Args: - - -* `inputs`: A list of `string` `Tensor` objects containing serialized `Summary` - protocol buffers. -* `collections`: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer resulting from the merging. - - -- - - - -### `tf.merge_all_summaries(key='summaries')` {#merge_all_summaries} - -Merges all summaries collected in the default graph. - -##### Args: - - -* `key`: `GraphKey` used to collect the summaries. 
Defaults to - `GraphKeys.SUMMARIES`. - -##### Returns: - - If no summaries were collected, returns None. Otherwise returns a scalar - `Tensor` of type `string` containing the serialized `Summary` protocol - buffer resulting from the merging. - - - -## Adding Summaries to Event Files +## Reading Summaries from Event Files See [Summaries and TensorBoard](../../how_tos/summaries_and_tensorboard/index.md) for an overview of summaries, event files, and visualization in TensorBoard. -- - - - -### `class tf.train.SummaryWriter` {#SummaryWriter} - - -- - - - -#### `tf.train.SummaryWriter.__init__(*args, **kwargs)` {#SummaryWriter.__init__} - -Creates a `SummaryWriter` and an event file. (deprecated) - -THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-30. -Instructions for updating: -Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename. - - This class is deprecated, and should be replaced with tf.summary.FileWriter. - - On construction the summary writer creates a new event file in `logdir`. - This event file will contain `Event` protocol buffers constructed when you - call one of the following functions: `add_summary()`, `add_session_log()`, - `add_event()`, or `add_graph()`. - - If you pass a `Graph` to the constructor it is added to - the event file. (This is equivalent to calling `add_graph()` later). - - TensorBoard will pick the graph from the file and display it graphically so - you can interactively explore the graph you built. You will usually pass - the graph from the session in which you launched it: - - ```python - ...create a graph... - # Launch the graph in a session. - sess = tf.Session() - # Create a summary writer, add the 'graph' to the event file. - writer = tf.train.SummaryWriter(, sess.graph) - ``` - - The other arguments to the constructor control the asynchronous writes to - the event file: - - * `flush_secs`: How often, in seconds, to flush the added summaries - and events to disk. - * `max_queue`: Maximum number of summaries or events pending to be - written to disk before one of the 'add' calls block. - - Args: - logdir: A string. Directory where event file will be written. - graph: A `Graph` object, such as `sess.graph`. - max_queue: Integer. Size of the queue for pending events and summaries. - flush_secs: Number. How often, in seconds, to flush the - pending events and summaries to disk. - graph_def: DEPRECATED: Use the `graph` argument instead. - - -- - - - -#### `tf.train.SummaryWriter.add_event(event)` {#SummaryWriter.add_event} - -Adds an event to the event file. - -##### Args: - - -* `event`: An `Event` protocol buffer. - - -- - - - -#### `tf.train.SummaryWriter.add_graph(graph, global_step=None, graph_def=None)` {#SummaryWriter.add_graph} - -Adds a `Graph` to the event file. - -The graph described by the protocol buffer will be displayed by -TensorBoard. Most users pass a graph in the constructor instead. - -##### Args: - - -* `graph`: A `Graph` object, such as `sess.graph`. -* `global_step`: Number. Optional global step counter to record with the - graph. -* `graph_def`: DEPRECATED. Use the `graph` parameter instead. - -##### Raises: - - -* `ValueError`: If both graph and graph_def are passed to the method. - - -- - - - -#### `tf.train.SummaryWriter.add_meta_graph(meta_graph_def, global_step=None)` {#SummaryWriter.add_meta_graph} - -Adds a `MetaGraphDef` to the event file. - -The `MetaGraphDef` allows running the given graph via -`saver.import_meta_graph()`. 
- -##### Args: - - -* `meta_graph_def`: A `MetaGraphDef` object, often as retured by - `saver.export_meta_graph()`. -* `global_step`: Number. Optional global step counter to record with the - graph. - -##### Raises: - - -* `TypeError`: If both `meta_graph_def` is not an instance of `MetaGraphDef`. - - -- - - - -#### `tf.train.SummaryWriter.add_run_metadata(run_metadata, tag, global_step=None)` {#SummaryWriter.add_run_metadata} - -Adds a metadata information for a single session.run() call. - -##### Args: - - -* `run_metadata`: A `RunMetadata` protobuf object. -* `tag`: The tag name for this metadata. -* `global_step`: Number. Optional global step counter to record with the - StepStats. - -##### Raises: - - -* `ValueError`: If the provided tag was already used for this type of event. - - -- - - - -#### `tf.train.SummaryWriter.add_session_log(session_log, global_step=None)` {#SummaryWriter.add_session_log} - -Adds a `SessionLog` protocol buffer to the event file. - -This method wraps the provided session in an `Event` protocol buffer -and adds it to the event file. - -##### Args: - - -* `session_log`: A `SessionLog` protocol buffer. -* `global_step`: Number. Optional global step value to record with the - summary. - - -- - - - -#### `tf.train.SummaryWriter.add_summary(summary, global_step=None)` {#SummaryWriter.add_summary} - -Adds a `Summary` protocol buffer to the event file. - -This method wraps the provided summary in an `Event` protocol buffer -and adds it to the event file. - -You can pass the result of evaluating any summary op, using -[`Session.run()`](client.md#Session.run) or -[`Tensor.eval()`](framework.md#Tensor.eval), to this -function. Alternatively, you can pass a `tf.Summary` protocol -buffer that you populate with your own data. The latter is -commonly done to report evaluation results in event files. - -##### Args: - - -* `summary`: A `Summary` protocol buffer, optionally serialized as a string. -* `global_step`: Number. Optional global step value to record with the - summary. - - -- - - - -#### `tf.train.SummaryWriter.close()` {#SummaryWriter.close} - -Flushes the event file to disk and close the file. - -Call this method when you do not need the summary writer anymore. - - -- - - - -#### `tf.train.SummaryWriter.flush()` {#SummaryWriter.flush} - -Flushes the event file to disk. - -Call this method to make sure that all pending events have been written to -disk. - - -- - - - -#### `tf.train.SummaryWriter.get_logdir()` {#SummaryWriter.get_logdir} - -Returns the directory where event file will be written. - - -- - - - -#### `tf.train.SummaryWriter.reopen()` {#SummaryWriter.reopen} - -Reopens the EventFileWriter. - -Can be called after `close()` to add more events in the same directory. -The events will go into a new events file. - -Does nothing if the EventFileWriter was not closed. - - - -- - - - -### `class tf.train.SummaryWriterCache` {#SummaryWriterCache} - -Cache for file writers. - -This class caches file writers, one per directory. -- - - - -#### `tf.train.SummaryWriterCache.clear()` {#SummaryWriterCache.clear} - -Clear cached summary writers. Currently only used for unit tests. - - -- - - - -#### `tf.train.SummaryWriterCache.get(logdir)` {#SummaryWriterCache.get} - -Returns the FileWriter for the specified directory. - -##### Args: - - -* `logdir`: str, name of the directory. - -##### Returns: - - A `FileWriter`. - - - - - - ### `tf.train.summary_iterator(path)` {#summary_iterator} @@ -4409,7 +3932,7 @@ for more information about their attributes. 
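A minimal sketch of reading summaries back with `tf.train.summary_iterator`; the events-file path is hypothetical, and `simple_value` is only meaningful for scalar summaries:

```python
import tensorflow as tf

# summary_iterator yields Event protocol buffers from an events file
# written by a tf.summary.FileWriter.
for event in tf.train.summary_iterator("/tmp/mnist_logs/train/events.out.tfevents.example"):
    for value in event.summary.value:
        print(event.step, value.tag, value.simple_value)
```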
-## Training utilities +## Training Utilities - - - @@ -4760,7 +4283,7 @@ such as saving a last checkpoint. Saves checkpoints every N steps or seconds. - - - -#### `tf.train.CheckpointSaverHook.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaverHook.__init__} +#### `tf.train.CheckpointSaverHook.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None, listeners=None)` {#CheckpointSaverHook.__init__} Initialize CheckpointSaverHook monitor. @@ -4773,6 +4296,10 @@ Initialize CheckpointSaverHook monitor. * `saver`: `Saver` object, used for saving. * `checkpoint_basename`: `str`, base name for the checkpoint files. * `scaffold`: `Scaffold`, use to get saver object. +* `listeners`: List of `CheckpointSaverListener` subclass instances. + Used for callbacks that run immediately after the corresponding + CheckpointSaverHook callbacks, only in steps where the + CheckpointSaverHook was triggered. ##### Raises: diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md index f4177dc47ad094..0ed4f2bad67b9a 100644 --- a/tensorflow/g3doc/get_started/os_setup.md +++ b/tensorflow/g3doc/get_started/os_setup.md @@ -885,10 +885,10 @@ remove the `_python_build` directory before running `bazel test`. ## Train your first TensorFlow neural net model -Starting from the root of your source tree, run: +Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. Run the following commands: ```bash -$ cd tensorflow/models/image/mnist +$ cd models/tutorials/image/mnist $ python convolutional.py Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes. Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes. diff --git a/tensorflow/g3doc/how_tos/distributed/index.md b/tensorflow/g3doc/how_tos/distributed/index.md index 1e2e2131828a08..859bd3b7aa5e78 100644 --- a/tensorflow/g3doc/how_tos/distributed/index.md +++ b/tensorflow/g3doc/how_tos/distributed/index.md @@ -171,7 +171,7 @@ simplify the work of specifying a replicated model. Possible approaches include: values for the current parameters, compute gradients in parallel, and then apply them together. It is compatible with in-graph replication (e.g. using gradient averaging as in the - [CIFAR-10 multi-GPU trainer](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py)), + [CIFAR-10 multi-GPU trainer](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py)), and between-graph replication (e.g. using the `tf.train.SyncReplicasOptimizer`). diff --git a/tensorflow/g3doc/how_tos/reading_data/index.md b/tensorflow/g3doc/how_tos/reading_data/index.md index ea100fa7f1eec7..1a364c3816f667 100644 --- a/tensorflow/g3doc/how_tos/reading_data/index.md +++ b/tensorflow/g3doc/how_tos/reading_data/index.md @@ -135,7 +135,7 @@ uses a file format where each record is represented using a fixed number of bytes: 1 byte for the label followed by 3072 bytes of image data. Once you have a uint8 tensor, standard operations can slice out each piece and reformat as needed. 
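The fixed-length record layout described above (1 label byte followed by 3072 image bytes) can be decoded roughly as follows. This is a sketch with a hypothetical filename, not the exact code from `cifar10_input.py`, and queue runners must be started before records can actually be pulled:

```python
import tensorflow as tf

label_bytes, image_bytes = 1, 32 * 32 * 3
filename_queue = tf.train.string_input_producer(["/tmp/cifar10/data_batch_1.bin"])

reader = tf.FixedLengthRecordReader(record_bytes=label_bytes + image_bytes)
_, value = reader.read(filename_queue)
record = tf.decode_raw(value, tf.uint8)

# Slice the record into its label and (depth-major) image pieces.
label = tf.cast(tf.slice(record, [0], [label_bytes]), tf.int32)
image = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]), [3, 32, 32])
```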
For CIFAR-10, you can see how to do the reading and decoding in -[`tensorflow/models/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_input.py) +[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) and described in [this tutorial](../../tutorials/deep_cnn/index.md#prepare-the-data). @@ -172,7 +172,7 @@ You can then do any preprocessing of these examples you want. This would be any processing that doesn't depend on trainable parameters. Examples include normalization of your data, picking a random slice, adding noise or distortions, etc. See -[`tensorflow/models/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_input.py) +[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) for an example. ### Batching diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md index 5f16c52bd416b1..eaed898e8c13fe 100644 --- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md +++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md @@ -54,17 +54,17 @@ to combine them into a single op that generates all the summary data. Then, you can just run the merged summary op, which will generate a serialized `Summary` protobuf object with all of your summary data at a given step. Finally, to write this summary data to disk, pass the summary protobuf to a -[`tf.train.SummaryWriter`](../../api_docs/python/train.md#SummaryWriter). +[`tf.summary.FileWriter`](../../api_docs/python/summary.md#FileWriter). -The `SummaryWriter` takes a logdir in its constructor - this logdir is quite +The `FileWriter` takes a logdir in its constructor - this logdir is quite important, it's the directory where all of the events will be written out. -Also, the `SummaryWriter` can optionally take a `Graph` in its constructor. +Also, the `FileWriter` can optionally take a `Graph` in its constructor. If it receives a `Graph` object, then TensorBoard will visualize your graph along with tensor shape information. This will give you a much better sense of what flows through the graph: see [Tensor shape information](../../how_tos/graph_viz/index.md#tensor-shape-information). -Now that you've modified your graph and have a `SummaryWriter`, you're ready to +Now that you've modified your graph and have a `FileWriter`, you're ready to start running your network! If you want, you could run the merged summary op every single step, and record a ton of training data. That's likely to be more data than you need, though. Instead, consider running the merged summary op @@ -153,14 +153,14 @@ tf.summary.scalar('accuracy', accuracy) # Merge all the summaries and write them out to /tmp/mnist_logs (by default) merged = tf.summary.merge_all() -train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', +train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train', sess.graph) -test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test') +test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test') tf.global_variables_initializer().run() ``` -After we've initialized the `SummaryWriters`, we have to add summaries to the -`SummaryWriters` as we train and test the model. 
+After we've initialized the `FileWriters`, we have to add summaries to the +`FileWriters` as we train and test the model. ```python # Train the model, and also write summaries. @@ -199,7 +199,7 @@ tensorflow.tensorboard`) tensorboard --logdir=path/to/log-directory ``` -where `logdir` points to the directory where the `SummaryWriter` serialized its +where `logdir` points to the directory where the `FileWriter` serialized its data. If this `logdir` directory contains subdirectories which contain serialized data from separate runs, then TensorBoard will visualize the data from all of those runs. Once TensorBoard is running, navigate your web browser diff --git a/tensorflow/g3doc/resources/versions.md b/tensorflow/g3doc/resources/versions.md index b7c9c1722a2ee5..34a8e6bc308e0d 100644 --- a/tensorflow/g3doc/resources/versions.md +++ b/tensorflow/g3doc/resources/versions.md @@ -3,7 +3,7 @@ ## Semantic Versioning 2.0 Once we reach version 1.0, TensorFlow will follow Semantic Versioning 2.0 -(semver). For details, see .  Each release version of +([semver](http://semver.org)) for its public API. Each release version of TensorFlow has the form `MAJOR.MINOR.PATCH`.  Changes to the each number have the following meaning: @@ -32,10 +32,11 @@ the restrictions of semver). ## Public API -Only the public API of TensorFlow is backwards compatible across minor and patch -versions.  The public API consists of +Only the C, C++, and Python public APIs of TensorFlow are backwards compatible +across minor and patch versions.  The public APIs consist of -* The documented [C++ and Python APIs](../api_docs). +* The documented [Python](../api_docs/python), [C++](../api_docs/cc) and + the [C](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h) APIs. * The following protocol buffer files: [`attr_value`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto), @@ -51,14 +52,21 @@ versions.  The public API consists of The public C++ API is exposed through the header files in [`tensorflow/core/public`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/public). + The public Python API is unfortunately **not** everything available through the tensorflow python module and its submodules, since we do not yet use `__all__` everywhere ([#421](https://github.com/tensorflow/tensorflow/issues/421)). - Please refer to the documentation to determine whether a given Python feature +Please refer to the documentation to determine whether a given Python feature is part of the public API. For now, the protocol buffers are defined in [`tensorflow/core/framework/*.proto`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/framework) ([#484](https://github.com/tensorflow/tensorflow/issues/484)). +> The [Java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java) +> ([#5](https://github.com/tensorflow/tensorflow/issues/5)) and +> [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go) APIs +> are experimental and are **not** covered by the versioning scheme at this time. +> They are not guaranteed to backward compatible between releases. + ## Details That Are Not Public @@ -139,7 +147,7 @@ the versions to account for changes, see [TensorFlow Data Versioning](data_versions.md). -## C++ API Compatibility +## C++ ABI Compatibility Only patch releases will be binary compatible at the C++ level.  
That is, minor releases are backwards compatible in terms of behavior but may require a diff --git a/tensorflow/g3doc/tutorials/deep_cnn/index.md b/tensorflow/g3doc/tutorials/deep_cnn/index.md index 9f44295c287b0f..ec9d726b3ae2d2 100644 --- a/tensorflow/g3doc/tutorials/deep_cnn/index.md +++ b/tensorflow/g3doc/tutorials/deep_cnn/index.md @@ -83,21 +83,21 @@ for details. It consists of 1,068,298 learnable parameters and requires about ## Code Organization The code for this tutorial resides in -[`tensorflow/models/image/cifar10/`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/). +[`tensorflow_models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/). File | Purpose --- | --- -[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format. -[`cifar10.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10.py) | Builds the CIFAR-10 model. -[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU. -[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs. -[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model. +[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format. +[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model. +[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU. +[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs. +[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model. ## CIFAR-10 Model The CIFAR-10 network is largely contained in -[`cifar10.py`](https://www.tensorflow.org/code/tensorflow/models/image/cifar10/cifar10.py). +[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py). The complete training graph contains roughly 765 operations. We find that we can make the code most reusable by constructing the graph with the following modules: diff --git a/tensorflow/g3doc/tutorials/image_recognition/index.md b/tensorflow/g3doc/tutorials/image_recognition/index.md index 990f906a443f15..1d20d7ddb39437 100644 --- a/tensorflow/g3doc/tutorials/image_recognition/index.md +++ b/tensorflow/g3doc/tutorials/image_recognition/index.md @@ -67,10 +67,9 @@ We're excited to see what the community will do with this model. when the program is run for the first time. You'll need about 200M of free space available on your hard disk. -The following instructions assume you installed TensorFlow from a PIP package -and that your terminal resides in the TensorFlow root directory. +Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. 
Run the following commands: - cd tensorflow/models/image/imagenet + cd models/tutorials/image/imagenet python classify_image.py The above command will classify a supplied image of a panda bear. diff --git a/tensorflow/g3doc/tutorials/input_fn/index.md b/tensorflow/g3doc/tutorials/input_fn/index.md index 51202a9c5a5048..831576433e28d2 100644 --- a/tensorflow/g3doc/tutorials/input_fn/index.md +++ b/tensorflow/g3doc/tutorials/input_fn/index.md @@ -222,6 +222,9 @@ logging verbosity](../monitors/index.md#enabling-logging-with-tensorflow) to from __future__ import absolute_import from __future__ import division from __future__ import print_function + +import itertools + import pandas as pd import tensorflow as tf @@ -301,8 +304,6 @@ which means the function can process any of the `DataFrame`s you've imported: To train the neural network regressor, run `fit` with the `training_set` passed to the `input_fn` as follows: - - ```python regressor.fit(input_fn=lambda: input_fn(training_set), steps=5000) ``` @@ -355,7 +356,9 @@ Finally, you can use the model to predict median house values for the ```python y = regressor.predict(input_fn=lambda: input_fn(prediction_set)) -print ("Predictions: {}".format(str(y))) +# .predict() returns an iterator; convert to a list and print predictions +predictions = list(itertools.islice(y, 6)) +print ("Predictions: {}".format(str(predictions))) ``` Your results should contain six house-value predictions in thousands of dollars, diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md index e22976100f74db..6ddec390f5857c 100644 --- a/tensorflow/g3doc/tutorials/recurrent/index.md +++ b/tensorflow/g3doc/tutorials/recurrent/index.md @@ -16,12 +16,12 @@ purpose we will use the [Penn Tree Bank](http://www.cis.upenn.edu/~treebank/) models, whilst being small and relatively fast to train. Language modeling is key to many interesting problems such as speech -recognition, machine translation, or image captioning. It is also fun, too -- +recognition, machine translation, or image captioning. It is also fun -- take a look [here](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). For the purpose of this tutorial, we will reproduce the results from [Zaremba et al., 2014](http://arxiv.org/abs/1409.2329) -([pdf](http://arxiv.org/pdf/1409.2329.pdf)), which achieves very good results +([pdf](http://arxiv.org/pdf/1409.2329.pdf)), which achieves very good quality on the PTB dataset. ## Tutorial Files @@ -41,20 +41,20 @@ http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz The dataset is already preprocessed and contains overall 10000 different words, including the end-of-sentence marker and a special symbol (\) for rare -words. We convert all of them in the `reader.py` to unique integer identifiers -to make it easy for the neural network to process. +words. In `reader.py`, we convert each word to a unique integer identifier, +in order to make it easy for the neural network to process the data. ## The Model ### LSTM The core of the model consists of an LSTM cell that processes one word at a -time and computes probabilities of the possible continuations of the sentence. -The memory state of the network is initialized with a vector of zeros and gets -updated after reading each word. Also, for computational reasons, we will +time and computes probabilities of the possible values for the next word in the +sentence. The memory state of the network is initialized with a vector of zeros +and gets updated after reading each word. 
For computational reasons, we will process data in mini-batches of size `batch_size`. -The basic pseudocode looks as follows: +The basic pseudocode is as follows: ```python lstm = rnn_cell.BasicLSTMCell(lstm_size) @@ -74,14 +74,17 @@ for current_batch_of_words in words_in_dataset: ### Truncated Backpropagation -In order to make the learning process tractable, it is a common practice to -truncate the gradients for backpropagation to a fixed number (`num_steps`) -of unrolled steps. -This is easy to implement by feeding inputs of length `num_steps` at a time and -doing backward pass after each iteration. +By design, the output of a recurrent neural network (RNN) depends on arbitrarily +distant inputs. Unfortunately, this makes backpropagation computation difficult. +In order to make the learning process tractable, it is common practice to create +an "unrolled" version of the network, which contains a fixed number +(`num_steps`) of LSTM inputs and outputs. The model is then trained on this +finite approximation of the RNN. This can be implemented by feeding inputs of +length `num_steps` at a time and performing a backward pass after each +such input block. -A simplified version of the code for the graph creation for truncated -backpropagation: +Here is a simplified block of code for creating a graph which performs +truncated backpropagation: ```python # Placeholder for the inputs in a given iteration. @@ -171,16 +174,10 @@ final_state = state ## Run the Code -We are assuming you have already installed via the pip package, have cloned the -tensorflow git repository, and are in the root of the git tree. (If [building -from source]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_started/os_setup.md#installing-from-sources), build the `tensorflow/models/rnn/ptb:ptb_word_lm` target using -[bazel](https://github.com/bazelbuild/bazel)). - -Next: +Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. Run the following commands: ```bash -cd tensorflow/models/rnn/ptb +cd models/tutorials/rnn/ptb python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small ``` diff --git a/tensorflow/g3doc/tutorials/seq2seq/index.md b/tensorflow/g3doc/tutorials/seq2seq/index.md index 4cfcc56b291caa..ed29a9a0947c62 100644 --- a/tensorflow/g3doc/tutorials/seq2seq/index.md +++ b/tensorflow/g3doc/tutorials/seq2seq/index.md @@ -8,14 +8,10 @@ some input and generate a meaningful response? For example, could we train a neural network to translate from English to French? It turns out that the answer is *yes*. -This tutorial will show you how to build and train such a system end-to-end. -We are assuming you have already installed via the pip package, have cloned the -tensorflow git repository, and are in the root of the git tree. - -You can then start by running the translate program: +This tutorial will show you how to build and train such a system end-to-end. Clone the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. 
You can then start by running the translate program: ``` -cd tensorflow/models/rnn/translate +cd models/tutorials/rnn/translate python translate.py --data_dir [your_data_directory] ``` diff --git a/tensorflow/g3doc/tutorials/word2vec/index.md b/tensorflow/g3doc/tutorials/word2vec/index.md index 936cb24a23603e..36f9f762ef41aa 100644 --- a/tensorflow/g3doc/tutorials/word2vec/index.md +++ b/tensorflow/g3doc/tutorials/word2vec/index.md @@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in This basic example contains the code needed to download some data, train on it a bit and visualize the result. Once you get comfortable with reading and running the basic version, you can graduate to -[tensorflow/models/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow/models/embedding/word2vec.py) +[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py) which is a more serious implementation that showcases some more advanced TensorFlow principles about how to efficiently use threads to move data into a text model, how to checkpoint during training, etc. @@ -337,7 +337,7 @@ t-SNE. Et voila! As expected, words that are similar end up clustering nearby each other. For a more heavyweight implementation of word2vec that showcases more of the advanced features of TensorFlow, see the implementation in -[tensorflow/models/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow/models/embedding/word2vec.py). +[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py). ## Evaluating Embeddings: Analogical Reasoning @@ -353,7 +353,7 @@ Download the dataset for this task from To see how we do this evaluation, have a look at the `build_eval_graph()` and `eval()` functions in -[tensorflow/models/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow/models/embedding/word2vec.py). +[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py). The choice of hyperparameters can strongly influence the accuracy on this task. To achieve state-of-the-art performance on this task requires training over a @@ -381,13 +381,13 @@ your model is seriously bottlenecked on input data, you may want to implement a custom data reader for your problem, as described in [New Data Formats](../../how_tos/new_data_formats/index.md). For the case of Skip-Gram modeling, we've actually already done this for you as an example in -[tensorflow/models/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow/models/embedding/word2vec.py). +[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py). If your model is no longer I/O bound but you want still more performance, you can take things further by writing your own TensorFlow Ops, as described in [Adding a New Op](../../how_tos/adding_an_op/index.md). Again we've provided an example of this for the Skip-Gram case -[tensorflow/models/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow/models/embedding/word2vec_optimized.py). +[tensorflow_models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py). Feel free to benchmark these against each other to measure performance improvements at each stage. 
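For intuition, the analogical-reasoning evaluation mentioned above boils down to nearest-neighbour search in embedding space. The sketch below is a hypothetical NumPy illustration; `embeddings`, `word_to_id`, and `id_to_word` are assumed to come from a trained model, and the real `build_eval_graph()` differs in detail:

```python
import numpy as np

def analogy(embeddings, word_to_id, id_to_word, a, b, c):
    """Predict d such that 'a is to b as c is to d' via embedding arithmetic."""
    # Normalize rows so that a dot product equals cosine similarity.
    norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    target = norm[word_to_id[b]] - norm[word_to_id[a]] + norm[word_to_id[c]]
    scores = np.dot(norm, target)
    for w in (a, b, c):
        scores[word_to_id[w]] = -np.inf   # never return the query words
    return id_to_word[int(np.argmax(scores))]
```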
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index f755e9d4f8ba08..c50e5e30aa64cf 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -108,6 +108,35 @@ func NewTensor(value interface{}) (*Tensor, error) { return t, nil } +// ReadTensor constructs a Tensor with the provided type and shape from the +// serialized tensor contents in r. +// +// See also WriteContentsTo. +func ReadTensor(dataType DataType, shape []int64, r io.Reader) (*Tensor, error) { + if err := isTensorSerializable(dataType); err != nil { + return nil, err + } + nbytes := typeOf(dataType, nil).Size() * uintptr(numElements(shape)) + var shapePtr *C.int64_t + if len(shape) > 0 { + shapePtr = (*C.int64_t)(unsafe.Pointer(&shape[0])) + } + t := &Tensor{ + c: C.TF_AllocateTensor(C.TF_DataType(dataType), shapePtr, C.int(len(shape)), C.size_t(nbytes)), + shape: shape, + } + runtime.SetFinalizer(t, (*Tensor).finalize) + raw := tensorData(t.c) + n, err := r.Read(raw) + if err != nil { + return nil, err + } + if uintptr(n) != nbytes { + return nil, fmt.Errorf("expected serialized tensor to be %v bytes, read %v", nbytes, n) + } + return t, nil +} + // newTensorFromC takes ownership of c and returns the owning Tensor. func newTensorFromC(c *C.TF_Tensor) *Tensor { var shape []int64 @@ -156,6 +185,21 @@ func (t *Tensor) Value() interface{} { return reflect.Indirect(val).Interface() } +// WriteContentsTo writes the serialized contents of t to w. +// +// Returns the number of bytes written. See ReadTensor for +// reconstructing a Tensor from the serialized form. +// +// WARNING: WriteContentsTo is not comprehensive and will fail +// if t.DataType() is non-numeric (e.g., String). See +// https://github.com/tensorflow/tensorflow/issues/6003. +func (t *Tensor) WriteContentsTo(w io.Writer) (int64, error) { + if err := isTensorSerializable(t.DataType()); err != nil { + return 0, err + } + return io.Copy(w, bytes.NewReader(tensorData(t.c))) +} + func tensorData(c *C.TF_Tensor) []byte { // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices cbytes := C.TF_TensorData(c) @@ -385,6 +429,23 @@ func bug(format string, args ...interface{}) error { return fmt.Errorf("BUG: Please report at https://github.com/tensorflow/tensorflow/issues with the note: Go TensorFlow %v: %v", Version(), fmt.Sprintf(format, args...)) } +func isTensorSerializable(dataType DataType) error { + // For numeric types, the serialized Tensor matches the in-memory + // representation. See the implementation of Tensor::AsProtoContent in + // https://www.tensorflow.org/code/tensorflow/core/framework/tensor.cc + // + // The more appropriate way to be in sync with Tensor::AsProtoContent + // would be to have the TensorFlow C library export functions for + // serialization and deserialization of Tensors. Till then capitalize + // on knowledge of the implementation for numeric types. + switch dataType { + case Float, Double, Int32, Uint8, Int16, Int8, Complex, Int64, Bool, Quint8, Qint32, Bfloat16, Qint16, Quint16, Uint16, Complex128, Half: + return nil + default: + return fmt.Errorf("serialization of tensors with the DataType %d is not yet supported, see https://github.com/tensorflow/tensorflow/issues/6003", dataType) + } +} + // nativeEndian is the byte order for the local platform. Used to send back and // forth Tensors with the C API. We test for endianness at runtime because // some architectures can be booted into different endian modes. 
diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go index 073da0cc6e471c..2a3ed416bdb83d 100644 --- a/tensorflow/go/tensor_test.go +++ b/tensorflow/go/tensor_test.go @@ -15,6 +15,7 @@ package tensorflow import ( + "bytes" "reflect" "testing" ) @@ -28,7 +29,6 @@ func TestNewTensor(t *testing.T) { {nil, int16(5)}, {nil, int32(5)}, {nil, int64(5)}, - {nil, int64(5)}, {nil, uint8(5)}, {nil, uint16(5)}, {nil, float32(5)}, @@ -103,6 +103,114 @@ func TestNewTensor(t *testing.T) { } } +func TestTensorSerialization(t *testing.T) { + var tests = []interface{}{ + int8(5), + int16(5), + int32(5), + int64(5), + uint8(5), + uint16(5), + float32(5), + float64(5), + complex(float32(5), float32(6)), + complex(float64(5), float64(6)), + []float64{1}, + [][]float32{{1, 2}, {3, 4}, {5, 6}}, + [][][]int8{ + {{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}, + {{0, -1}, {-2, -3}, {-4, -5}}, + {{-6, -7}, {-8, -9}, {-10, -11}}, + }, + } + for _, v := range tests { + t1, err := NewTensor(v) + if err != nil { + t.Errorf("(%v): %v", v, err) + continue + } + buf := new(bytes.Buffer) + n, err := t1.WriteContentsTo(buf) + if err != nil { + t.Errorf("(%v): %v", v, err) + continue + } + if n != int64(buf.Len()) { + t.Errorf("(%v): WriteContentsTo said it wrote %v bytes, but wrote %v", v, n, buf.Len()) + } + t2, err := ReadTensor(t1.DataType(), t1.Shape(), buf) + if err != nil { + t.Errorf("(%v): %v", v, err) + continue + } + if buf.Len() != 0 { + t.Errorf("(%v): %v bytes written by WriteContentsTo not read by ReadTensor", v, buf.Len()) + } + if got, want := t2.DataType(), t1.DataType(); got != want { + t.Errorf("(%v): Got %v, want %v", v, got, want) + } + if got, want := t2.Shape(), t1.Shape(); !reflect.DeepEqual(got, want) { + t.Errorf("(%v): Got %v, want %v", v, got, want) + } + if got, want := t2.Value(), v; !reflect.DeepEqual(got, want) { + t.Errorf("(%v): Got %v, want %v", v, got, want) + } + } +} + +func TestReadTensorDoesNotReadBeyondContent(t *testing.T) { + t1, _ := NewTensor(int8(7)) + t2, _ := NewTensor(float32(2.718)) + buf := new(bytes.Buffer) + if _, err := t1.WriteContentsTo(buf); err != nil { + t.Fatal(err) + } + if _, err := t2.WriteContentsTo(buf); err != nil { + t.Fatal(err) + } + + t3, err := ReadTensor(t1.DataType(), t1.Shape(), buf) + if err != nil { + t.Fatal(err) + } + t4, err := ReadTensor(t2.DataType(), t2.Shape(), buf) + if err != nil { + t.Fatal(err) + } + + if v, ok := t3.Value().(int8); !ok || v != 7 { + t.Errorf("Got (%v (%T), %v), want (7 (int8), true)", v, v, ok) + } + if v, ok := t4.Value().(float32); !ok || v != 2.718 { + t.Errorf("Got (%v (%T), %v), want (2.718 (float32), true)", v, v, ok) + } +} + +func TestTensorSerializationErrors(t *testing.T) { + // String tensors cannot be serialized + t1, err := NewTensor("abcd") + if err != nil { + t.Fatal(err) + } + buf := new(bytes.Buffer) + if n, err := t1.WriteContentsTo(buf); n != 0 || err == nil || buf.Len() != 0 { + t.Errorf("Got (%v, %v, %v) want (0, , 0)", n, err, buf.Len()) + } + // Should fail to read a truncated value. 
+ if t1, err = NewTensor(int8(8)); err != nil { + t.Fatal(err) + } + n, err := t1.WriteContentsTo(buf) + if err != nil { + t.Fatal(err) + } + r := bytes.NewReader(buf.Bytes()[:n-1]) + if _, err = ReadTensor(t1.DataType(), t1.Shape(), r); err == nil { + t.Error("ReadTensor should have failed if the tensor content was truncated") + } +} + func benchmarkNewTensor(b *testing.B, v interface{}) { for i := 0; i < b.N; i++ { if t, err := NewTensor(v); err != nil || t == nil { diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 2aa077c2b7a325..4522c4a59ea192 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -12,8 +12,52 @@ java_library( visibility = ["//visibility:public"], ) +java_library( + name = "testutil", + testonly = 1, + srcs = ["src/test/java/org/tensorflow/TestUtil.java"], + deps = [":tensorflow"], +) + +java_test( + name = "GraphTest", + size = "small", + srcs = ["src/test/java/org/tensorflow/GraphTest.java"], + test_class = "org.tensorflow.GraphTest", + deps = [ + ":tensorflow", + ":testutil", + "//external:junit", + ], +) + +java_test( + name = "OperationBuilderTest", + size = "small", + srcs = ["src/test/java/org/tensorflow/OperationBuilderTest.java"], + test_class = "org.tensorflow.OperationBuilderTest", + deps = [ + ":tensorflow", + ":testutil", + "//external:junit", + ], +) + +java_test( + name = "SessionTest", + size = "small", + srcs = ["src/test/java/org/tensorflow/SessionTest.java"], + test_class = "org.tensorflow.SessionTest", + deps = [ + ":tensorflow", + ":testutil", + "//external:junit", + ], +) + java_test( name = "TensorFlowTest", + size = "small", srcs = ["src/test/java/org/tensorflow/TensorFlowTest.java"], test_class = "org.tensorflow.TensorFlowTest", deps = [ @@ -22,6 +66,17 @@ java_test( ], ) +java_test( + name = "TensorTest", + size = "small", + srcs = ["src/test/java/org/tensorflow/TensorTest.java"], + test_class = "org.tensorflow.TensorTest", + deps = [ + ":tensorflow", + "//external:junit", + ], +) + filegroup( name = "libtensorflow-jni", srcs = select({ diff --git a/tensorflow/java/src/main/java/org/tensorflow/DataType.java b/tensorflow/java/src/main/java/org/tensorflow/DataType.java new file mode 100644 index 00000000000000..a7c6e12b41a069 --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/DataType.java @@ -0,0 +1,63 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** Type of elements in a {@link Tensor}. */ +public enum DataType { + /** 32-bit single precision floating point. */ + FLOAT(1), + + /** 64-bit double precision floating point. */ + DOUBLE(2), + + /** 32-bit signed integer. */ + INT32(3), + + /** + * A sequence of bytes. + * + *

TensorFlow uses the STRING type for an arbitrary sequence of bytes. + */ + STRING(7), + + /** 64-bit signed integer. */ + INT64(9), + + /** Boolean. */ + BOOL(10); + + private final int value; + + // The integer value must match the corresponding TF_* value in the TensorFlow C API. + DataType(int value) { + this.value = value; + } + + /** Corresponding value of the TF_DataType enum in the TensorFlow C API. */ + int c() { + return value; + } + + static DataType fromC(int c) { + for (DataType t : DataType.values()) { + if (t.c() == c) { + return t; + } + } + throw new IllegalArgumentException( + "DataType " + c + " is not recognized in Java (version " + TensorFlow.version() + ")"); + } +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Graph.java b/tensorflow/java/src/main/java/org/tensorflow/Graph.java new file mode 100644 index 00000000000000..6a1dd4c1136b35 --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Graph.java @@ -0,0 +1,191 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** + * A data flow graph representing a TensorFlow computation. + * + *

Instances of a Graph are thread-safe. + * + *
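A minimal usage sketch (illustrative only, using just the methods declared in this class): a Graph is typically managed with try-with-resources, and operation() returns null for a name that is not present in the graph.

    // Illustrative sketch, not part of the patch.
    try (Graph g = new Graph()) {
      if (g.operation("not_yet_added") == null) {
        System.out.println("no such operation");
      }
    }
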

WARNING: Resources consumed by the Graph object must be explicitly freed by invoking + * the {@link #close()} method when the Graph object is no longer needed. + */ +public final class Graph implements AutoCloseable { + + /** Create an empty Graph. */ + public Graph() { + nativeHandle = allocate(); + } + + /** + * Release resources associated with the Graph. + * + *

Blocks until there are no active {@link Session} instances referring to this Graph. A Graph + * is not usable after close returns. + */ + @Override + public void close() { + synchronized (nativeHandleLock) { + if (nativeHandle == 0) { + return; + } + while (refcount > 0) { + try { + nativeHandleLock.wait(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + // Possible leak of the graph in this case? + return; + } + } + delete(nativeHandle); + nativeHandle = 0; + } + } + + /** + * Returns the operation (node in the Graph) with the provided name. + * + *

Or {@code null} if no such operation exists in the Graph. + */ + public Operation operation(String name) { + synchronized (nativeHandleLock) { + long oph = operation(nativeHandle, name); + if (oph == 0) { + return null; + } + return new Operation(this, oph); + } + } + + /** + * Returns a builder to add {@link Operation}s to the Graph. + * + * @param type of the Operation (i.e., identifies the computation to be performed) + * @param name to refer to the created Operation in the graph. + * @return an {@link OperationBuilder}, which will add the Operation to the graph when {@link + * OperationBuilder#build()} is invoked. If {@link OperationBuilder#build()} is not invoked, + * then some resources may leak. + */ + public OperationBuilder opBuilder(String type, String name) { + return new OperationBuilder(this, type, name); + } + + /** + * Import a serialized representation of a TensorFlow graph. + * + *
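A short sketch of the GraphDef round trip described below (the operations added to the first graph are elided; they would be created via opBuilder()):

    // g1 and g2 are separate Graph instances; the ops added to g1 are elided here.
    byte[] def;
    try (Graph g1 = new Graph()) {
      // ... add operations to g1 via g1.opBuilder(...) ...
      def = g1.toGraphDef();
    }
    try (Graph g2 = new Graph()) {
      g2.importGraphDef(def, "imported");  // node names in g2 get the "imported" prefix
    }
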

The serialized representation of the graph, often referred to as a GraphDef, can be + * generated by {@link #toGraphDef()} and equivalents in other language APIs. + * + * @throws IllegalArgumentException if graphDef is not a recognized serialization of a graph. + * @see #importGraphDef(byte[], String) + */ + public void importGraphDef(byte[] graphDef) throws IllegalArgumentException { + importGraphDef(graphDef, ""); + } + + /** + * Import a serialized representation of a TensorFlow graph. + * + * @param graphDef the serialized representation of a TensorFlow graph. + * @param prefix a prefix that will be prepended to names in graphDef + * @throws IllegalArgumentException if graphDef is not a recognized serialization of a graph. + * @see #importGraphDef(byte[]) + */ + public void importGraphDef(byte[] graphDef, String prefix) throws IllegalArgumentException { + if (graphDef == null || prefix == null) { + throw new IllegalArgumentException("graphDef and prefix cannot be null"); + } + synchronized (nativeHandleLock) { + importGraphDef(nativeHandle, graphDef, prefix); + } + } + + /** + * Generate a serialized representation of the Graph. + * + * @see #importGraphDef(byte[]) + * @see #importGraphDef(byte[], String) + */ + public byte[] toGraphDef() { + synchronized (nativeHandleLock) { + return toGraphDef(nativeHandle); + } + } + + private final Object nativeHandleLock = new Object(); + private long nativeHandle; + private int refcount = 0; + + // Related native objects (such as the TF_Operation object backing an Operation instance) + // have a validity tied to that of the Graph. The handles to those native objects are not + // valid after Graph.close() has been invoked. + // + // Instances of the Reference class should be used to ensure the Graph has not been closed + // while dependent handles are in use. + class Reference implements AutoCloseable { + private Reference() { + synchronized (Graph.this.nativeHandleLock) { + active = Graph.this.nativeHandle != 0; + if (!active) { + throw new IllegalStateException("close() has been called on the Graph"); + } + active = true; + Graph.this.refcount++; + } + } + + @Override + public void close() { + synchronized (Graph.this.nativeHandleLock) { + if (!active) { + return; + } + active = false; + if (--Graph.this.refcount == 0) { + Graph.this.nativeHandleLock.notifyAll(); + } + } + } + + public long nativeHandle() { + synchronized (Graph.this.nativeHandleLock) { + return active ? Graph.this.nativeHandle : 0; + } + } + + private boolean active; + } + + Reference ref() { + return new Reference(); + } + + private static native long allocate(); + + private static native void delete(long handle); + + private static native long operation(long handle, String name); + + private static native void importGraphDef(long handle, byte[] graphDef, String prefix) + throws IllegalArgumentException; + + private static native byte[] toGraphDef(long handle); + + static { + TensorFlow.init(); + } +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java new file mode 100644 index 00000000000000..576fa5ee888513 --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java @@ -0,0 +1,81 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** + * A Graph node that performs computation on Tensors. + * + *

An Operation is a node in a {@link Graph} that takes zero or more {@link Tensor}s (produced by + * other Operations in the Graph) as input, and produces zero or more {@link Tensor}s as output. + * + *

Operation instances are valid only as long as the Graph they are a part of is valid. Thus, if + * {@link Graph#close()} has been invoked, then methods on the Operation instance may fail with an + * {@code IllegalStateException}. + * + *
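A hedged sketch of this lifetime rule (the op type "Placeholder" and its "dtype" attribute are TensorFlow op-registry names, not something defined in this patch):

    // Sketch only.
    Graph g = new Graph();
    Operation op = g.opBuilder("Placeholder", "input")
        .setAttr("dtype", DataType.FLOAT)
        .build();
    System.out.println(op.name());  // prints "input"
    g.close();
    // op.name() would now throw IllegalStateException: the Operation's native handle
    // belonged to the Graph that was just closed.
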

Operation instances are immutable and thread-safe. + */ +public final class Operation { + + // Create an Operation instance referring to an operation in g, with the given handle to the C + // TF_Operation object. The handle is valid only as long as g has not been closed, hence it is + // called unsafeHandle. Graph.ref() is used to safely use the unsafeHandle. + Operation(Graph g, long unsafeNativeHandle) { + this.graph = g; + this.unsafeNativeHandle = unsafeNativeHandle; + } + + /** Returns the full name of the Operation. */ + public String name() { + try (Graph.Reference r = graph.ref()) { + return name(unsafeNativeHandle); + } + } + + /** + * Returns the type of the operation, i.e., the name of the computation performed by the + * operation. + */ + public String type() { + try (Graph.Reference r = graph.ref()) { + return type(unsafeNativeHandle); + } + } + + /** Returns the number of tensors produced by this operation. */ + public int numOutputs() { + try (Graph.Reference r = graph.ref()) { + return numOutputs(unsafeNativeHandle); + } + } + + /** Returns a symbolic handle to one of the tensors produced by this operation. */ + public Output output(int idx) { + return new Output(this, idx); + } + + long getUnsafeNativeHandle() { + return unsafeNativeHandle; + } + + private final long unsafeNativeHandle; + private final Graph graph; + + private static native String name(long handle); + + private static native String type(long handle); + + private static native int numOutputs(long handle); +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java new file mode 100644 index 00000000000000..37468b508a427f --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java @@ -0,0 +1,220 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import java.nio.charset.Charset; + +/** + * A builder for {@link Operation}s in a {@link Graph}. + * + *

Instances of an OperationBuilder are not thread-safe. + * + *

A builder for adding {@link Operation}s to a {@link Graph}. For example, the following uses + * the builder to create an operation that produces the constant "3" as its output: + * + *

{@code
+ * // g is a Graph instance.
+ * try (Tensor c1 = Tensor.create(3.0f)) {
+ *   g.opBuilder("Const", "MyConst")
+ *       .setAttr("dtype", c1.dataType())
+ *       .setAttr("value", c1)
+ *       .build();
+ * }
+ * }
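Building on the example above, a sketch of wiring operation inputs with addInput() ("Const" and "Add" are TensorFlow op-registry names, assumed here for illustration):

    // g is a Graph instance.
    try (Tensor a = Tensor.create(2); Tensor b = Tensor.create(3)) {
      Output x = g.opBuilder("Const", "a")
          .setAttr("dtype", a.dataType())
          .setAttr("value", a)
          .build()
          .output(0);
      Output y = g.opBuilder("Const", "b")
          .setAttr("dtype", b.dataType())
          .setAttr("value", b)
          .build()
          .output(0);
      g.opBuilder("Add", "sum").addInput(x).addInput(y).build();
    }
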
+ */ +public final class OperationBuilder { + + OperationBuilder(Graph graph, String type, String name) { + this.graph = graph; + try (Graph.Reference r = graph.ref()) { + this.unsafeNativeHandle = allocate(r.nativeHandle(), type, name); + } + } + + /** + * Add the {@link Operation} being built to the {@link Graph}. + * + *

The OperationBuilder is not usable after build() returns. + */ + public Operation build() { + try (Graph.Reference r = graph.ref()) { + Operation op = new Operation(graph, finish(unsafeNativeHandle)); + unsafeNativeHandle = 0; + return op; + } + } + + public OperationBuilder addInput(Output input) { + try (Graph.Reference r = graph.ref()) { + addInput(unsafeNativeHandle, input.op().getUnsafeNativeHandle(), input.index()); + } + return this; + } + + public OperationBuilder addInputList(Output[] inputs) { + try (Graph.Reference r = graph.ref()) { + long[] opHandles = new long[inputs.length]; + int[] indices = new int[inputs.length]; + for (int i = 0; i < inputs.length; ++i) { + opHandles[i] = inputs[i].op().getUnsafeNativeHandle(); + indices[i] = inputs[i].index(); + } + addInputList(unsafeNativeHandle, opHandles, indices); + } + return this; + } + + public OperationBuilder setDevice(String device) { + try (Graph.Reference r = graph.ref()) { + setDevice(unsafeNativeHandle, device); + } + return this; + } + + public OperationBuilder setAttr(String name, String value) { + setAttr(name, value.getBytes(Charset.forName("UTF-8"))); + return this; + } + + public OperationBuilder setAttr(String name, byte[] value) { + try (Graph.Reference r = graph.ref()) { + setAttrString(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, long value) { + try (Graph.Reference r = graph.ref()) { + setAttrInt(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, long[] value) { + try (Graph.Reference r = graph.ref()) { + setAttrIntList(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, float value) { + try (Graph.Reference r = graph.ref()) { + setAttrFloat(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, float[] value) { + try (Graph.Reference r = graph.ref()) { + setAttrFloatList(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, boolean value) { + try (Graph.Reference r = graph.ref()) { + setAttrBool(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, boolean[] value) { + try (Graph.Reference r = graph.ref()) { + setAttrBoolList(unsafeNativeHandle, name, value); + } + return this; + } + + public OperationBuilder setAttr(String name, DataType value) { + try (Graph.Reference r = graph.ref()) { + setAttrType(unsafeNativeHandle, name, value.c()); + } + return this; + } + + public OperationBuilder setAttr(String name, DataType[] value) { + int[] ctypes = new int[value.length]; + for (int i = 0; i < value.length; ++i) { + ctypes[i] = value[i].c(); + } + try (Graph.Reference r = graph.ref()) { + setAttrTypeList(unsafeNativeHandle, name, ctypes); + } + return this; + } + + public OperationBuilder setAttr(String name, Tensor value) { + try (Graph.Reference r = graph.ref()) { + setAttrTensor(unsafeNativeHandle, name, value.getNativeHandle()); + } + return this; + } + + public OperationBuilder setAttr(String name, Tensor[] value) { + long[] handles = new long[value.length]; + int idx = 0; + for (Tensor t : value) { + handles[idx++] = t.getNativeHandle(); + } + try (Graph.Reference r = graph.ref()) { + setAttrTensorList(unsafeNativeHandle, name, handles); + } + return this; + } + + private long unsafeNativeHandle; + private Graph graph; + + private static native long allocate(long graphHandle, String type, 
String name); + + private static native long finish(long handle); + + private static native void addInput(long handle, long opHandle, int index); + + private static native void addInputList(long handle, long[] opHandles, int[] indices); + + private static native void setDevice(long handle, String device); + + // The names of all the setAttr* family functions below correspond to the C library types, not the + // Java library types. Roughly, setAttrFoo calls the TensorFlow C library function: TF_SetAttrFoo. + // + // TODO(ashankar): + // - setAttrStringList: Which would take in an array of byte[] (java Strings will need to be UTF-8 + // encoded?) + // - setAttrShape and setAttrShapeList: Which would take in a long[] or long[][]? + + private static native void setAttrString(long handle, String name, byte[] value); + + private static native void setAttrInt(long handle, String name, long value); + + private static native void setAttrIntList(long handle, String name, long[] value); + + private static native void setAttrFloat(long handle, String name, float value); + + private static native void setAttrFloatList(long handle, String name, float[] value); + + private static native void setAttrBool(long handle, String name, boolean value); + + private static native void setAttrBoolList(long handle, String name, boolean[] value); + + private static native void setAttrType(long handle, String name, int type); + + private static native void setAttrTypeList(long handle, String name, int[] type); + + private static native void setAttrTensor(long handle, String name, long tensorHandle); + + private static native void setAttrTensorList(long handle, String name, long[] tensorHandle); +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Output.java b/tensorflow/java/src/main/java/org/tensorflow/Output.java new file mode 100644 index 00000000000000..f0fffc2c1df20d --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Output.java @@ -0,0 +1,44 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** + * A symbolic handle to a tensor produced by an {@link Operation}. + * + *

An Output is a symbolic handle to a tensor. The value of the Tensor is computed by executing + * the {@link Operation} in a {@link Session}. + */ +public final class Output { + + /** Handle to the idx-th output of the Operation {@code op}. */ + public Output(Operation op, int idx) { + operation = op; + index = idx; + } + + /** Returns the Operation that will produce the tensor referred to by this Output. */ + public Operation op() { + return operation; + } + + /** Returns the index into the outputs of the Operation. */ + public int index() { + return index; + } + + private final Operation operation; + private final int index; +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java new file mode 100644 index 00000000000000..ca9f96b1a829cd --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java @@ -0,0 +1,315 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import java.util.ArrayList; +import java.util.List; + +/** + * Driver for {@link Graph} execution. + * + *

A {@code Session} instance encapsulates the environment in which {@link Operation}s in a + * {@link Graph} are executed to compute {@link Tensor}s. For example: + * + *

{@code
+ * // Let's say graph is an instance of the Graph class
+ * // for the computation y = 3 * x
+ *
+ * try (Session s = new Session(graph)) {
+ *   try (Tensor x = Tensor.create(2.0f);
+ *       Tensor y = s.runner().feed("x", x).fetch("y").run().get(0)) {
+ *       System.out.println(y.floatValue());  // Will print 6.0f
+ *   }
+ *   try (Tensor x = Tensor.create(1.1f);
+ *       Tensor y = s.runner().feed("x", x).fetch("y").run().get(0)) {
+ *       System.out.println(y.floatValue());  // Will print 3.3f
+ *   }
+ * }
+ * }
+ * + *
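A further sketch (the op names "init", "x" and "y" are assumed to exist in the graph): addTarget() runs an operation purely for its side effects, without fetching a value.

    // graph is a Graph instance.
    try (Session s = new Session(graph)) {
      s.runner().addTarget("init").run();  // run an op purely for its side effects
      try (Tensor x = Tensor.create(2.0f);
          Tensor y = s.runner().feed("x", x).fetch("y").run().get(0)) {
        System.out.println(y.floatValue());
      }
    }
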

WARNING: A {@code Session} owns resources that must be explicitly freed by + * invoking {@link #close()}. + * + *

Instances of a Session are thread-safe. + */ +public final class Session implements AutoCloseable { + + /** Construct a new session with the associated {@link Graph}. */ + public Session(Graph g) { + graph = g; + try (Graph.Reference r = g.ref()) { + nativeHandle = allocate(r.nativeHandle()); + graphRef = g.ref(); + } + } + + /** + * Release resources associated with the Session. + * + *

Blocks until there are no active executions ({@link Session.Runner#run()} calls). A Session + * is not usable after close returns. + */ + @Override + public void close() { + graphRef.close(); + synchronized (nativeHandleLock) { + if (nativeHandle == 0) { + return; + } + while (numActiveRuns > 0) { + try { + nativeHandleLock.wait(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + // Possible leak of the Session and Graph in this case? + return; + } + } + delete(nativeHandle); + nativeHandle = 0; + } + } + + /** + * Run {@link Operation}s and evaluate {@link Tensor}s. + * + *

A Runner runs the necessary graph fragments to execute every {@link Operation} required to + * evaluate the {@link Tensor}s to fetch. The {@link #feed(String,int,Tensor)} call allows callers + * to override the value of {@link Tensor}s in the graph by substituting the provided {@link + * Tensor}s for the outputs of the operations provided to {@link #feed(String,int,Tensor)}. + */ + public final class Runner { + /** + * Avoid evaluating {@code operation} and substitute {@code t} for the value it produces. + * + *

This method is a shorthand for {@code feed(operation, 0, t)}. + */ + public Runner feed(String operation, Tensor t) { + return feed(operation, 0, t); + } + + /** + * Avoid evaluating the {@code index}-th output of {@code operation} by substituting {@code t} + * for the value it produces. + * + *

Operations in a {@link Graph} can have multiple outputs, {@code index} identifies which + * one {@code t} is being provided for. + */ + public Runner feed(String operation, int index, Tensor t) { + Operation op = operationByName(operation); + if (op != null) { + inputs.add(op.output(index)); + inputTensors.add(t); + } + return this; + } + + /** + * Make {@link #run()} return the output of {@code operation}. + * + *

This method is a shorthand for {@code fetch(operation, 0)} + */ + public Runner fetch(String operation) { + return fetch(operation, 0); + } + + /** + * Make {@link #run()} return the {@code index}-th output of {@code operation}. + * + *

Operations in a {@link Graph} can have multiple outputs, {@code index} identifies which + * one to return. + */ + public Runner fetch(String operation, int index) { + Operation op = operationByName(operation); + if (op != null) { + outputs.add(op.output(index)); + } + return this; + } + + /** + * Make {@link #run()} execute {@code operation}, but not return the evaluated {@link Tensor}. + */ + public Runner addTarget(String operation) { + Operation op = operationByName(operation); + if (op != null) { + targets.add(op); + } + return this; + } + + /** + * Execute the graph fragments necessary to compute all requested fetches. + * + *

WARNING: The caller assumes ownership of all returned {@link Tensor}s, i.e., the + * caller must call {@link Tensor#close()} on all elements of the returned list to free up + * resources. + * + *
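A sketch of the cleanup obligation described above (the fetch names and consume(...) are hypothetical):

    // s is a Session instance.
    List<Tensor> outputs = s.runner().fetch("logits").fetch("probabilities").run();
    try {
      consume(outputs);  // hypothetical consumer
    } finally {
      for (Tensor t : outputs) {
        t.close();  // every returned Tensor must be closed by the caller
      }
    }
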

TODO(ashankar): Reconsider the return type here. Two things in particular: (a) Make it + * easier for the caller to cleanup (perhaps returning something like AutoCloseableList in + * SessionTest.java), and (b) Evaluate whether the return value should be a list, or maybe a + * {@code Map}? + */ + public List run() { + long[] inputTensorHandles = new long[inputTensors.size()]; + long[] inputOpHandles = new long[inputs.size()]; + int[] inputOpIndices = new int[inputs.size()]; + long[] outputOpHandles = new long[outputs.size()]; + int[] outputOpIndices = new int[outputs.size()]; + long[] targetOpHandles = new long[targets.size()]; + long[] outputTensorHandles = new long[outputs.size()]; + + // It's okay to use Operation.getUnsafeNativeHandle() here since the safety depends on the + // validity of the Graph and graphRef ensures that. + int idx = 0; + for (Tensor t : inputTensors) { + inputTensorHandles[idx++] = t.getNativeHandle(); + } + idx = 0; + for (Output o : inputs) { + inputOpHandles[idx] = o.op().getUnsafeNativeHandle(); + inputOpIndices[idx] = o.index(); + idx++; + } + idx = 0; + for (Output o : outputs) { + outputOpHandles[idx] = o.op().getUnsafeNativeHandle(); + outputOpIndices[idx] = o.index(); + } + idx = 0; + for (Operation op : targets) { + targetOpHandles[idx++] = op.getUnsafeNativeHandle(); + } + try (Reference runref = new Reference()) { + Session.run( + nativeHandle, + null, /* runOptions */ + inputTensorHandles, + inputOpHandles, + inputOpIndices, + outputOpHandles, + outputOpIndices, + targetOpHandles, + false, /* wantRunMetadata */ + outputTensorHandles); + } + List ret = new ArrayList(); + for (long h : outputTensorHandles) { + try { + ret.add(Tensor.fromHandle(h)); + } catch (Exception e) { + for (Tensor t : ret) { + t.close(); + } + ret.clear(); + throw e; + } + } + return ret; + } + + private class Reference implements AutoCloseable { + public Reference() { + synchronized (nativeHandleLock) { + if (nativeHandle == 0) { + throw new IllegalStateException("run() called after the Session was close()d"); + } + ++numActiveRuns; + } + } + + @Override + public void close() { + synchronized (nativeHandleLock) { + if (nativeHandle == 0) { + return; + } + if (--numActiveRuns == 0) { + nativeHandleLock.notifyAll(); + } + } + } + } + + private Operation operationByName(String opName) { + Operation op = graph.operation(opName); + if (op == null) { + throw new IllegalArgumentException("No Operation named [" + opName + "] in the Graph"); + } + return op; + } + + private ArrayList inputs = new ArrayList(); + private ArrayList inputTensors = new ArrayList(); + private ArrayList outputs = new ArrayList(); + private ArrayList targets = new ArrayList(); + } + + /** Create a Runner to execute graph operations and evaluate Tensors. */ + public Runner runner() { + return new Runner(); + } + + private final Graph graph; + private final Graph.Reference graphRef; + + private final Object nativeHandleLock = new Object(); + private long nativeHandle; + private int numActiveRuns; + + private static native long allocate(long graphHandle); + + private static native void delete(long handle); + + /** + * Execute a session. + * + *

The author apologizes for the ugliness of the long argument list of this method. However, + * take solace in the fact that this is a private method meant to cross the JNI boundary. + * + * @param handle to the C API TF_Session object (Session.nativeHandle) + * @param runOptions serialized representation of a RunOptions protocol buffer, or null + * @param inputOpHandles (see inputOpIndices) + * @param inputOpIndices (see inputTensorHandles) + * @param inputTensorHandles together with inputOpHandles and inputOpIndices specifies the values + * that are being "fed" (do not need to be computed) during graph execution. + * inputTensorHandles[i] (which correponds to a Tensor.nativeHandle) is considered to be the + * inputOpIndices[i]-th output of the Operation inputOpHandles[i]. Thus, it is required that + * inputOpHandles.length == inputOpIndices.length == inputTensorHandles.length. + * @param outputOpHandles (see outputOpIndices) + * @param outputOpIndices together with outputOpHandles identifies the set of values that should + * be computed. The outputOpIndices[i]-th output of the Operation outputOpHandles[i], It is + * required that outputOpHandles.length == outputOpIndices.length. + * @param targetOpHandles is the set of Operations in the graph that are to be executed but whose + * output will not be returned + * @param wantRunMetadata indicates whether metadata about this execution should be returned. + * @param outputTensorHandles will be filled in with handles to the outputs requested. It is + * required that outputTensorHandles.length == outputOpHandles.length. + * @return if wantRunMetadata is true, serialized representation of the RunMetadata protocol + * buffer, false otherwise. + */ + private static native byte[] run( + long handle, + byte[] runOptions, + long[] inputTensorHandles, + long[] inputOpHandles, + int[] inputOpIndices, + long[] outputOpHandles, + int[] outputOpIndices, + long[] targetOpHandles, + boolean wantRunMetadata, + long[] outputTensorHandles); +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java new file mode 100644 index 00000000000000..5478bb85e9bdd0 --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java @@ -0,0 +1,349 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import java.lang.reflect.Array; +import java.util.Arrays; + +/** + * A typed multi-dimensional array. + * + *

Instances of a Tensor are not thread-safe. + * + *

WARNING: Resources consumed by the Tensor object must be explicitly freed by + * invoking the {@link #close()} method when the object is no longer needed. For example, using a + * try-with-resources block like: + * + *

{@code
+ * try(Tensor t = Tensor.create(...)) {
+ *   doSomethingWith(t);
+ * }
+ * }
+ */ +public final class Tensor implements AutoCloseable { + /** + * Create a Tensor from a Java object. + * + *

A Tensor is a multi-dimensional array of elements of a limited set of types ({@link + * DataType}). Thus, not all Java objects can be converted to a Tensor. In particular, {@code obj} + * must be either a primitive (float, double, int, long, boolean) or a multi-dimensional array of + * one of those primitives. For example: + * + *

{@code
+   * // Valid: A 64-bit integer scalar.
+   * Tensor s = Tensor.create(42L);
+   *
+   * // Valid: A 3x2 matrix of floats.
+   * float[][] matrix = new float[3][2];
+   * Tensor m = Tensor.create(matrix);
+   *
+   * // Invalid: Will throw an IllegalArgumentException as an arbitrary Object
+   * // does not fit into the TensorFlow type system.
+   * Tensor o = Tensor.create(new Object());
+   *
+   * // Invalid: Will throw an IllegalArgumentException since there are
+   * // a differing number of elements in each row of this 2-D array.
+   * int[][] twoD = new int[2][];
+   * twoD[0] = new int[1];
+   * twoD[1] = new int[2];
+   * Tensor x = Tensor.create(twoD);
+   * }
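One additional case, based on the byte[] handling in the method body below (illustrative sketch): a byte[] is treated as a scalar DataType.STRING tensor, and its contents can be read back with bytesValue().

    byte[] data = "hello".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    try (Tensor s = Tensor.create(data)) {
      System.out.println(s.dataType());       // STRING
      System.out.println(s.numDimensions());  // 0, i.e. a scalar
      System.out.println(new String(s.bytesValue(), java.nio.charset.StandardCharsets.UTF_8));
    }
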
+ * + * @throws IllegalArgumentException if {@code obj} is not compatible with the TensorFlow type + * system. + */ + public static Tensor create(Object obj) { + Tensor t = new Tensor(); + t.dtype = dataTypeOf(obj); + t.shapeCopy = new long[numDimensions(obj)]; + fillShape(obj, 0, t.shapeCopy); + if (t.dtype != DataType.STRING) { + t.nativeHandle = allocate(t.dtype.c(), t.shapeCopy); + setValue(t.nativeHandle, obj); + } else if (t.shapeCopy.length != 0) { + throw new UnsupportedOperationException( + String.format( + "non-scalar DataType.STRING tensors are not supported yet (version %s). Please file a feature request at https://github.com/tensorflow/tensorflow/issues/new", + TensorFlow.version())); + } else { + t.nativeHandle = allocateScalarBytes((byte[]) obj); + } + return t; + } + + /** + * Release resources associated with the Tensor. + * + *

WARNING: If not invoked, memory will be leaked. + * + *

The Tensor object is no longer usable after {@code close} returns. + */ + @Override + public void close() { + if (nativeHandle != 0) { + delete(nativeHandle); + nativeHandle = 0; + } + } + + /** Returns the {@link DataType} of elements stored in the Tensor. */ + public DataType dataType() { + return dtype; + } + + /** + * Returns the number of dimensions (sometimes referred to as rank) of the Tensor. + * + *

Will be 0 for a scalar, 1 for a vector, 2 for a matrix, 3 for a 3-dimensional tensor etc. + */ + public int numDimensions() { + return shapeCopy.length; + } + + /** + * Returns the shape of + * the Tensor, i.e., the sizes of each dimension. + * + * @return an array where the i-th element is the size of the i-th dimension of the tensor. + */ + public long[] shape() { + return shapeCopy; + } + + /** + * Returns the value in a scalar {@link DataType#FLOAT} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a float scalar. + */ + public float floatValue() { + return scalarFloat(nativeHandle); + } + + /** + * Returns the value in a scalar {@link DataType#DOUBLE} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a double scalar. + */ + public double doubleValue() { + return scalarDouble(nativeHandle); + } + + /** + * Returns the value in a scalar {@link DataType#INT32} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a int scalar. + */ + public int intValue() { + return scalarInt(nativeHandle); + } + + /** + * Returns the value in a scalar {@link DataType#INT64} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a long scalar. + */ + public long longValue() { + return scalarLong(nativeHandle); + } + + /** + * Returns the value in a scalar {@link DataType#BOOL} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a boolean scalar. + */ + public boolean booleanValue() { + return scalarBoolean(nativeHandle); + } + + /** + * Returns the value in a scalar {@link DataType#STRING} tensor. + * + * @throws IllegalArgumentException if the Tensor does not represent a boolean scalar. + */ + public byte[] bytesValue() { + return scalarBytes(nativeHandle); + } + + /** + * Copies the contents of the tensor to {@code dst} and returns {@code dst}. + * + *

For non-scalar tensors, this method copies the contents of the underlying tensor to a Java + * array. For scalar tensors, use one of {@link #floatValue()}, {@link #doubleValue()}, {@link + * #intValue()}, {@link #longValue()} or {@link #booleanValue()} instead. The type and shape of + * {@code dst} must be compatible with the tensor. For example: + * + *

{@code
+   * int[][] matrix = {{1,2},{3,4}};
+   * try(Tensor t = Tensor.create(matrix)) {
+   *   // Succeeds and prints "3"
+   *   int[][] copy = new int[2][2];
+   *   System.out.println(t.copyTo(copy)[1][0]);
+   *
+   *   // Throws IllegalArgumentException since the shape of dst does not match the shape of t.
+   *   int[][] dst = new int[4][1];
+   *   t.copyTo(dst);
+   * }
+   * }
+ * + * @throws IllegalArgumentException if the tensor is a scalar or if {@code dst} is not compatible + * with the tensor (for example, mismatched data types or shapes). + */ + public T copyTo(T dst) { + throwExceptionIfTypeIsIncompatible(dst); + readNDArray(nativeHandle, dst); + return dst; + } + + /** Returns a string describing the type and shape of the Tensor. */ + @Override + public String toString() { + return String.format("%s tensor with shape %s", dtype.toString(), Arrays.toString(shape())); + } + + /** + * Create a Tensor object from a handle to the C TF_Tensor object. + * + *

Takes ownership of the handle. + */ + static Tensor fromHandle(long handle) { + Tensor t = new Tensor(); + t.dtype = DataType.fromC(dtype(handle)); + t.shapeCopy = shape(handle); + t.nativeHandle = handle; + return t; + } + + long getNativeHandle() { + return nativeHandle; + } + + private long nativeHandle; + private DataType dtype; + private long[] shapeCopy = null; + + private Tensor() {} + + private static DataType dataTypeOf(Object o) { + if (o.getClass().isArray()) { + if (Array.getLength(o) == 0) { + throw new IllegalArgumentException("cannot create Tensors with a 0 dimension"); + } + // byte[] is a DataType.STRING scalar. + Object e = Array.get(o, 0); + if (Byte.class.isInstance(e) || byte.class.isInstance(e)) { + return DataType.STRING; + } + return dataTypeOf(e); + } + if (Float.class.isInstance(o) || float.class.isInstance(o)) { + return DataType.FLOAT; + } else if (Double.class.isInstance(o) || double.class.isInstance(o)) { + return DataType.DOUBLE; + } else if (Integer.class.isInstance(o) || int.class.isInstance(o)) { + return DataType.INT32; + } else if (Long.class.isInstance(o) || long.class.isInstance(o)) { + return DataType.INT64; + } else if (Boolean.class.isInstance(o) || boolean.class.isInstance(o)) { + return DataType.BOOL; + } else { + throw new IllegalArgumentException("cannot create Tensors of " + o.getClass().getName()); + } + } + + private static int numDimensions(Object o) { + if (o.getClass().isArray()) { + // byte[] is a DataType.STRING scalar. + Object e = Array.get(o, 0); + if (Byte.class.isInstance(e) || byte.class.isInstance(e)) { + return 0; + } + return 1 + numDimensions(e); + } + return 0; + } + + private static void fillShape(Object o, int dim, long[] shape) { + if (shape == null || dim == shape.length) { + return; + } + final int len = Array.getLength(o); + if (shape[dim] == 0) { + shape[dim] = len; + } else if (shape[dim] != len) { + throw new IllegalArgumentException( + String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim)); + } + for (int i = 0; i < len; ++i) { + fillShape(Array.get(o, i), dim + 1, shape); + } + } + + private void throwExceptionIfTypeIsIncompatible(Object o) { + if (numDimensions(o) != numDimensions()) { + throw new IllegalArgumentException( + String.format( + "cannot copy Tensor with %d dimensions into an object with %d", + numDimensions(), numDimensions(o))); + } + if (dataTypeOf(o) != dtype) { + throw new IllegalArgumentException( + String.format( + "cannot copy Tensor with DataType %s into an object of type %s", + dtype.toString(), o.getClass().getName())); + } + long[] oShape = new long[numDimensions()]; + fillShape(o, 0, oShape); + for (int i = 0; i < oShape.length; ++i) { + if (oShape[i] != shape()[i]) { + throw new IllegalArgumentException( + String.format( + "cannot copy Tensor with shape %s into object with shape %s", + Arrays.toString(shape()), Arrays.toString(oShape))); + } + } + } + + private static native long allocate(int dtype, long[] shape); + + private static native long allocateScalarBytes(byte[] value); + + private static native void delete(long handle); + + private static native int dtype(long handle); + + private static native long[] shape(long handle); + + private static native void setValue(long handle, Object value); + + private static native float scalarFloat(long handle); + + private static native double scalarDouble(long handle); + + private static native int scalarInt(long handle); + + private static native long scalarLong(long handle); + + private static native boolean 
scalarBoolean(long handle); + + private static native byte[] scalarBytes(long handle); + + private static native void readNDArray(long handle, Object value); + + static { + TensorFlow.init(); + } +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java index dc7f87b9282e77..c42dfc8e200b4b 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java +++ b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java @@ -17,12 +17,17 @@ /** Static utility methods describing the TensorFlow runtime. */ public final class TensorFlow { + /** Returns the version of the underlying TensorFlow runtime. */ + public static native String version(); + private TensorFlow() {} - static { + /** Load the TensorFlow runtime C library. */ + static void init() { System.loadLibrary("tensorflow-jni"); } - /** Returns the version of the underlying TensorFlow runtime. */ - public static native String getVersion(); + static { + init(); + } } diff --git a/tensorflow/java/src/main/java/org/tensorflow/TensorFlowException.java b/tensorflow/java/src/main/java/org/tensorflow/TensorFlowException.java new file mode 100644 index 00000000000000..02aafe62d5a9dd --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/TensorFlowException.java @@ -0,0 +1,23 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** Unchecked exception thrown when executing TensorFlow Graphs. 
*/ +public final class TensorFlowException extends RuntimeException { + TensorFlowException(String message) { + super(message); + } +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/Example.java b/tensorflow/java/src/main/java/org/tensorflow/examples/Example.java index f61c44b4abbfcf..630632087a22cc 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/examples/Example.java +++ b/tensorflow/java/src/main/java/org/tensorflow/examples/Example.java @@ -24,6 +24,6 @@ */ public class Example { public static void main(String[] args) { - System.out.println("TensorFlow version: " + TensorFlow.getVersion()); + System.out.println("TensorFlow version: " + TensorFlow.version()); } } diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD index 3a2d0cbbfbbbfd..451906ab61c434 100644 --- a/tensorflow/java/src/main/native/BUILD +++ b/tensorflow/java/src/main/native/BUILD @@ -10,15 +10,17 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_library") tf_cuda_library( name = "native", - srcs = [ - "tensorflow.cc", + srcs = glob(["*.cc"]) + [ ":jni.h", ":jni_md.h", ], - hdrs = ["tensorflow.h"], + hdrs = glob(["*.h"]), includes = ["."], deps = [ "//tensorflow/c:c_api", + "//tensorflow/core:all_kernels", + "//tensorflow/core:direct_session", + "//tensorflow/core:ops", ], alwayslink = 1, ) diff --git a/tensorflow/java/src/main/native/exception_jni.cc b/tensorflow/java/src/main/native/exception_jni.cc new file mode 100644 index 00000000000000..3ae610a15db8d8 --- /dev/null +++ b/tensorflow/java/src/main/native/exception_jni.cc @@ -0,0 +1,69 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" + +const char kIllegalArgumentException[] = "java/lang/IllegalArgumentException"; +const char kIllegalStateException[] = "java/lang/IllegalStateException"; +const char kNullPointerException[] = "java/lang/NullPointerException"; +const char kIndexOutOfBoundsException[] = "java/lang/IndexOutOfBoundsException"; +const char kUnsupportedOperationException[] = + "java/lang/UnsupportedOperationException"; + +void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + char* message = nullptr; + if (vasprintf(&message, fmt, args) >= 0) { + env->ThrowNew(env->FindClass(clazz), message); + } else { + env->ThrowNew(env->FindClass(clazz), ""); + } + va_end(args); +} + +namespace { +// Map TF_Codes to unchecked exceptions. 
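From the Java side, this mapping means TensorFlow errors surface as ordinary unchecked exceptions; a hedged sketch:

    // A malformed GraphDef is rejected with TF_INVALID_ARGUMENT, which this layer
    // rethrows as IllegalArgumentException; no checked exceptions are involved.
    try (Graph g = new Graph()) {
      g.importGraphDef(new byte[] {1, 2, 3});  // not a valid GraphDef serialization
    } catch (IllegalArgumentException e) {
      System.err.println("rejected: " + e.getMessage());
    }
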
+const char* exceptionClassName(TF_Code code) { + switch (code) { + case TF_OK: + return nullptr; + case TF_INVALID_ARGUMENT: + return kIllegalArgumentException; + case TF_UNAUTHENTICATED: + case TF_PERMISSION_DENIED: + return "java/lang/SecurityException"; + case TF_RESOURCE_EXHAUSTED: + case TF_FAILED_PRECONDITION: + return kIllegalStateException; + case TF_OUT_OF_RANGE: + return kIndexOutOfBoundsException; + case TF_UNIMPLEMENTED: + return kUnsupportedOperationException; + default: + return "org/tensorflow/TensorFlowException"; + } +} +} // namespace + +bool throwExceptionIfNotOK(JNIEnv* env, const TF_Status* status) { + const char* clazz = exceptionClassName(TF_GetCode(status)); + if (clazz == nullptr) return true; + env->ThrowNew(env->FindClass(clazz), TF_Message(status)); + return false; +} diff --git a/tensorflow/java/src/main/native/exception_jni.h b/tensorflow/java/src/main/native/exception_jni.h new file mode 100644 index 00000000000000..4dfcdf60c810a0 --- /dev/null +++ b/tensorflow/java/src/main/native/exception_jni.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_EXCEPTION_JNI_H_ +#define TENSORFLOW_JAVA_EXCEPTION_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +class TF_Status; + +extern const char kIllegalArgumentException[]; +extern const char kIllegalStateException[]; +extern const char kNullPointerException[]; +extern const char kIndexOutOfBoundsException[]; +extern const char kUnsupportedOperationException[]; + +void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...); + +// If status is not TF_OK, then throw an appropriate exception. +// Returns true iff TF_GetCode(status) == TF_OK. +bool throwExceptionIfNotOK(JNIEnv* env, const TF_Status* status); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_EXCEPTION_JNI_H_ diff --git a/tensorflow/java/src/main/native/graph_jni.cc b/tensorflow/java/src/main/native/graph_jni.cc new file mode 100644 index 00000000000000..8e9187b4373606 --- /dev/null +++ b/tensorflow/java/src/main/native/graph_jni.cc @@ -0,0 +1,112 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/java/src/main/native/graph_jni.h" + +#include +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" + +namespace { +TF_Graph* requireHandle(JNIEnv* env, jlong handle) { + static_assert(sizeof(jlong) >= sizeof(TF_Graph*), + "Cannot package C object pointers as a Java long"); + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Graph"); + return nullptr; + } + return reinterpret_cast(handle); +} +} // namespace + +JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_allocate(JNIEnv*, jclass) { + return reinterpret_cast(TF_NewGraph()); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Graph_delete(JNIEnv*, jclass, + jlong handle) { + if (handle == 0) return; + TF_DeleteGraph(reinterpret_cast(handle)); +} + +JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_operation(JNIEnv* env, + jclass clazz, + jlong handle, + jstring name) { + TF_Graph* g = requireHandle(env, handle); + if (g == nullptr) return 0; + const char* cname = env->GetStringUTFChars(name, nullptr); + TF_Operation* op = TF_GraphOperationByName(g, cname); + env->ReleaseStringUTFChars(name, cname); + return reinterpret_cast(op); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Graph_importGraphDef( + JNIEnv* env, jclass clazz, jlong handle, jbyteArray graph_def, + jstring prefix) { + TF_Graph* g = requireHandle(env, handle); + if (g == nullptr) return; + + TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions(); + + jboolean is_copy; + const char* cprefix = env->GetStringUTFChars(prefix, &is_copy); + TF_ImportGraphDefOptionsSetPrefix(opts, cprefix); + env->ReleaseStringUTFChars(prefix, cprefix); + + static_assert(sizeof(jbyte) == 1, "unexpected size of the jbyte type"); + jbyte* bytes = env->GetByteArrayElements(graph_def, &is_copy); + TF_Buffer* buf = + TF_NewBufferFromString(bytes, env->GetArrayLength(graph_def)); + TF_Status* status = TF_NewStatus(); + + TF_GraphImportGraphDef(g, buf, opts, status); + throwExceptionIfNotOK(env, status); + // Continue cleaning up resources even if an exception was thrown. + + TF_DeleteStatus(status); + TF_DeleteBuffer(buf); + env->ReleaseByteArrayElements(graph_def, bytes, JNI_ABORT); + + TF_DeleteImportGraphDefOptions(opts); +} + +JNIEXPORT jbyteArray JNICALL +Java_org_tensorflow_Graph_toGraphDef(JNIEnv* env, jclass clazz, jlong handle) { + jbyteArray ret = nullptr; + TF_Graph* g = requireHandle(env, handle); + if (g == nullptr) return ret; + + TF_Buffer* buf = TF_NewBuffer(); + TF_Status* status = TF_NewStatus(); + TF_GraphToGraphDef(g, buf, status); + if (throwExceptionIfNotOK(env, status)) { + // sizeof(jsize) is less than sizeof(size_t) on some platforms. + if (buf->length > std::numeric_limits::max()) { + throwException(env, kIndexOutOfBoundsException, + "GraphDef is too large to serialize into a byte[] array"); + } else { + static_assert(sizeof(jbyte) == 1, "unexpected size of the jbyte type"); + jint ret_len = static_cast(buf->length); + ret = env->NewByteArray(ret_len); + env->SetByteArrayRegion(ret, 0, ret_len, + static_cast(buf->data)); + } + } + TF_DeleteStatus(status); + TF_DeleteBuffer(buf); + return ret; +} diff --git a/tensorflow/java/src/main/native/graph_jni.h b/tensorflow/java/src/main/native/graph_jni.h new file mode 100644 index 00000000000000..b84c11578ee584 --- /dev/null +++ b/tensorflow/java/src/main/native/graph_jni.h @@ -0,0 +1,70 @@ +/* Copyright 2016 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_GRAPH_JNI_H_ +#define TENSORFLOW_JAVA_GRAPH_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_Graph + * Method: allocate + * Signature: ()J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_allocate(JNIEnv *, jclass); + +/* + * Class: org_tensorflow_Graph + * Method: delete + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Graph_delete(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Graph + * Method: operation + * Signature: (JLjava/lang/String;)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_operation(JNIEnv *, jclass, + jlong, jstring); + +/* + * Class: org_tensorflow_Graph + * Method: importGraphDef + * Signature: (J[BLjava/lang/String;)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Graph_importGraphDef(JNIEnv *, + jclass, jlong, + jbyteArray, + jstring); + +/* + * Class: org_tensorflow_Graph + * Method: toGraphDef + * Signature: (J)[B + */ +JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Graph_toGraphDef(JNIEnv *, + jclass, + jlong); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_GRAPH_JNI_H_ diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc new file mode 100644 index 00000000000000..2a32cba70c7122 --- /dev/null +++ b/tensorflow/java/src/main/native/operation_builder_jni.cc @@ -0,0 +1,221 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/java/src/main/native/operation_builder_jni.h" + +#include +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" + +namespace { +TF_OperationDescription* requireHandle(JNIEnv* env, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "Operation has already been built"); + return 0; + } + return reinterpret_cast(handle); +} + +bool resolveOutput(JNIEnv* env, jlong op_handle, jint index, TF_Output* out) { + if (op_handle == 0) { + throwException(env, kIllegalStateException, + "close() was called on the Graph"); + return false; + } + out->oper = reinterpret_cast(op_handle); + out->index = static_cast(index); + return true; +} + +TF_Tensor* requireTensor(JNIEnv* env, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Tensor"); + return nullptr; + } + return reinterpret_cast(handle); +} +} // namespace + +JNIEXPORT jlong JNICALL Java_org_tensorflow_OperationBuilder_allocate( + JNIEnv* env, jclass clazz, jlong graph_handle, jstring type, jstring name) { + if (graph_handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Graph"); + return 0; + } + TF_Graph* graph = reinterpret_cast(graph_handle); + const char* op_type = env->GetStringUTFChars(type, nullptr); + const char* op_name = env->GetStringUTFChars(name, nullptr); + TF_OperationDescription* d = TF_NewOperation(graph, op_type, op_name); + env->ReleaseStringUTFChars(name, op_name); + env->ReleaseStringUTFChars(type, op_type); + static_assert(sizeof(jlong) >= sizeof(TF_OperationDescription*), + "Cannot represent a C TF_OperationDescription as a Java long"); + return reinterpret_cast(d); +} + +JNIEXPORT jlong JNICALL Java_org_tensorflow_OperationBuilder_finish( + JNIEnv* env, jclass clazz, jlong handle) { + TF_OperationDescription* d = requireHandle(env, handle); + if (d == nullptr) return 0; + TF_Status* status = TF_NewStatus(); + TF_Operation* op = TF_FinishOperation(d, status); + if (throwExceptionIfNotOK(env, status)) { + return reinterpret_cast(op); + } + return 0; +} + +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInput( + JNIEnv* env, jclass clazz, jlong handle, jlong op_handle, jint index) { + TF_Output out; + if (!resolveOutput(env, op_handle, index, &out)) return; + TF_OperationDescription* d = requireHandle(env, handle); + if (d == nullptr) return; + TF_AddInput(d, out); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList( + JNIEnv* env, jclass clazz, jlong handle, jlongArray op_handles, + jintArray indices) { + TF_OperationDescription* d = requireHandle(env, handle); + if (d == nullptr) return; + const size_t n = static_cast(env->GetArrayLength(op_handles)); + if (env->GetArrayLength(indices) != n) { + throwException(env, kIllegalArgumentException, + "mismatch in number of Operations (%d) and output indices " + "(%d) provided", + n, env->GetArrayLength(indices)); + return; + } + std::unique_ptr o(new TF_Output[n]); + jlong* oph = env->GetLongArrayElements(op_handles, nullptr); + jint* idx = env->GetIntArrayElements(indices, nullptr); + bool ok = true; + for (int i = 0; i < n && ok; ++i) { + ok = resolveOutput(env, oph[i], idx[i], &o[i]); + } + env->ReleaseIntArrayElements(indices, idx, JNI_ABORT); + env->ReleaseLongArrayElements(op_handles, oph, JNI_ABORT); + if (!ok) return; + TF_AddInputList(d, o.get(), n); +} 
+
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setDevice(
+    JNIEnv* env, jclass clazz, jlong handle, jstring device) {
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  const char* cdevice = env->GetStringUTFChars(device, nullptr);
+  TF_SetDevice(d, cdevice);
+  env->ReleaseStringUTFChars(device, cdevice);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrString(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name, jbyteArray value) {
+  static_assert(sizeof(jbyte) == 1,
+                "Require Java byte to be represented as a single byte");
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  jbyte* cvalue = env->GetByteArrayElements(value, nullptr);
+  TF_SetAttrString(d, cname, cvalue, env->GetArrayLength(value));
+  env->ReleaseByteArrayElements(value, cvalue, JNI_ABORT);
+  env->ReleaseStringUTFChars(name, cname);
+}
+
+#define DEFINE_SET_ATTR_SCALAR(name, jtype, ctype)                           \
+  JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttr##name( \
+      JNIEnv* env, jclass clazz, jlong handle, jstring name, jtype value) {  \
+    static_assert(                                                           \
+        sizeof(ctype) >= sizeof(jtype),                                      \
+        "Information loss when converting between Java and C types");        \
+    TF_OperationDescription* d = requireHandle(env, handle);                 \
+    if (d == nullptr) return;                                                \
+    const char* cname = env->GetStringUTFChars(name, nullptr);               \
+    TF_SetAttr##name(d, cname, static_cast<ctype>(value));                   \
+    env->ReleaseStringUTFChars(name, cname);                                 \
+  }
+
+#define DEFINE_SET_ATTR_LIST(name, jname, jtype, ctype)                      \
+  JNIEXPORT void JNICALL                                                     \
+      Java_org_tensorflow_OperationBuilder_setAttr##name##List(              \
+          JNIEnv* env, jclass clazz, jlong handle, jstring name,             \
+          jtype##Array value) {                                              \
+    TF_OperationDescription* d = requireHandle(env, handle);                 \
+    if (d == nullptr) return;                                                \
+    const char* cname = env->GetStringUTFChars(name, nullptr);               \
+    /* Make a copy of the array to paper over any differences */             \
+    /* in byte representations of the jtype and ctype. */                    \
+    /* For example, jint vs TF_DataType. */                                  \
+    /* If this copy turns out to be a problem in practice, it */             \
+    /* can be avoided for many types. */                                     \
+    const int n = env->GetArrayLength(value);                                \
+    std::unique_ptr<ctype[]> cvalue(new ctype[n]);                           \
+    jtype* elems = env->Get##jname##ArrayElements(value, nullptr);           \
+    for (int i = 0; i < n; ++i) {                                            \
+      cvalue[i] = static_cast<ctype>(elems[i]);                              \
+    }                                                                        \
+    env->Release##jname##ArrayElements(value, elems, JNI_ABORT);             \
+    TF_SetAttr##name##List(d, cname, cvalue.get(), n);                       \
+    env->ReleaseStringUTFChars(name, cname);                                 \
+  }
+
+#define DEFINE_SET_ATTR(name, jname, jtype, ctype) \
+  DEFINE_SET_ATTR_SCALAR(name, jtype, ctype)       \
+  DEFINE_SET_ATTR_LIST(name, jname, jtype, ctype)
+
+DEFINE_SET_ATTR(Int, Long, jlong, int64_t);
+DEFINE_SET_ATTR(Float, Float, jfloat, float);
+DEFINE_SET_ATTR(Bool, Boolean, jboolean, unsigned char);
+DEFINE_SET_ATTR(Type, Int, jint, TF_DataType);
+#undef DEFINE_SET_ATTR
+#undef DEFINE_SET_ATTR_LIST
+#undef DEFINE_SET_ATTR_SCALAR
+
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensor(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name,
+    jlong tensor_handle) {
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  TF_Tensor* t = requireTensor(env, tensor_handle);
+  if (t == nullptr) return;
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TF_SetAttrTensor(d, cname, t, status);
+  throwExceptionIfNotOK(env, status);
+  env->ReleaseStringUTFChars(name, cname);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensorList(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name,
+    jlongArray tensor_handles) {
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  const int n = env->GetArrayLength(tensor_handles);
+  std::unique_ptr<TF_Tensor*[]> tensors(new TF_Tensor*[n]);
+  jlong* jhandles = env->GetLongArrayElements(tensor_handles, nullptr);
+  bool ok = true;
+  for (int i = 0; i < n && ok; ++i) {
+    tensors[i] = requireTensor(env, jhandles[i]);
+    ok = !env->ExceptionCheck();
+  }
+  env->ReleaseLongArrayElements(tensor_handles, jhandles, JNI_ABORT);
+  if (!ok) return;
+
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  TF_Status* status = TF_NewStatus();
+  TF_SetAttrTensorList(d, cname, tensors.get(), n, status);
+  throwExceptionIfNotOK(env, status);
+  env->ReleaseStringUTFChars(name, cname);
+}
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
new file mode 100644
index 00000000000000..bd298cf8c8dd77
--- /dev/null
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -0,0 +1,159 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_ +#define TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_OperationBuilder + * Method: allocate + * Signature: (JLjava/lang/String;Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_OperationBuilder_allocate( + JNIEnv *, jclass, jlong, jstring, jstring); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: finish + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_OperationBuilder_finish(JNIEnv *, + jclass, + jlong); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: addInput + * Signature: (JJI)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInput( + JNIEnv *, jclass, jlong, jlong, jint); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: addInputList + * Signature: (J[J[I)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList( + JNIEnv *, jclass, jlong, jlongArray, jintArray); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setDevice + * Signature: (JLjava/lang/String;)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setDevice(JNIEnv *, + jclass, + jlong, + jstring); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrString + * Signature: (JLjava/lang/String;[B)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrString( + JNIEnv *, jclass, jlong, jstring, jbyteArray); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrInt + * Signature: (JLjava/lang/String;J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrInt( + JNIEnv *, jclass, jlong, jstring, jlong); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrIntList + * Signature: (JLjava/lang/String;[J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrIntList( + JNIEnv *, jclass, jlong, jstring, jlongArray); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrFloat + * Signature: (JLjava/lang/String;F)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrFloat( + JNIEnv *, jclass, jlong, jstring, jfloat); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrFloatList + * Signature: (JLjava/lang/String;[F)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrFloatList( + JNIEnv *, jclass, jlong, jstring, jfloatArray); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrBool + * Signature: (JLjava/lang/String;Z)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrBool( + JNIEnv *, jclass, jlong, jstring, jboolean); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrBoolList + * Signature: (JLjava/lang/String;[Z)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrBoolList( + JNIEnv *, jclass, jlong, jstring, jbooleanArray); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrType + * Signature: (JLjava/lang/String;I)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrType( + JNIEnv *, jclass, jlong, jstring, jint); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrTypeList + * Signature: (JLjava/lang/String;[I)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTypeList( + JNIEnv *, jclass, jlong, jstring, jintArray); + +/* + * Class: 
org_tensorflow_OperationBuilder + * Method: setAttrTensor + * Signature: (JLjava/lang/String;J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensor( + JNIEnv *, jclass, jlong, jstring, jlong); + +/* + * Class: org_tensorflow_OperationBuilder + * Method: setAttrTensorList + * Signature: (JLjava/lang/String;[J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensorList( + JNIEnv *, jclass, jlong, jstring, jlongArray); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_ diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc new file mode 100644 index 00000000000000..e2eaacd4189e48 --- /dev/null +++ b/tensorflow/java/src/main/native/operation_jni.cc @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/java/src/main/native/operation_jni.h" + +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" + +namespace { +TF_Operation* requireHandle(JNIEnv* env, jlong handle) { + static_assert(sizeof(jlong) >= sizeof(TF_Operation*), + "Cannot package C object pointers as a Java long"); + if (handle == 0) { + throwException( + env, kNullPointerException, + "close() has been called on the Graph this Operation was a part of"); + return nullptr; + } + return reinterpret_cast(handle); +} +} // namespace + +JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_name(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Operation* op = requireHandle(env, handle); + if (op == nullptr) return nullptr; + return env->NewStringUTF(TF_OperationName(op)); +} + +JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_type(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Operation* op = requireHandle(env, handle); + if (op == nullptr) return nullptr; + return env->NewStringUTF(TF_OperationOpType(op)); +} + +JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Operation* op = requireHandle(env, handle); + if (op == nullptr) return 0; + return TF_OperationNumOutputs(op); +} diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h new file mode 100644 index 00000000000000..ca25ef728eeb54 --- /dev/null +++ b/tensorflow/java/src/main/native/operation_jni.h @@ -0,0 +1,53 @@ + +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_OPERATION_JNI_H_ +#define TENSORFLOW_JAVA_OPERATION_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_Operation + * Method: name + * Signature: (J)Ljava/lang/String; + */ +JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_name(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Operation + * Method: type + * Signature: (J)Ljava/lang/String; + */ +JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_type(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Operation + * Method: numOutputs + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv *, + jclass, jlong); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_OPERATION_JNI_H_ diff --git a/tensorflow/java/src/main/native/session_jni.cc b/tensorflow/java/src/main/native/session_jni.cc new file mode 100644 index 00000000000000..7c054245e68949 --- /dev/null +++ b/tensorflow/java/src/main/native/session_jni.cc @@ -0,0 +1,175 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+#include "tensorflow/java/src/main/native/session_jni.h"
+
+namespace {
+TF_Session* requireHandle(JNIEnv* env, jlong handle) {
+  static_assert(sizeof(jlong) >= sizeof(TF_Session*),
+                "Cannot package C object pointers as a Java long");
+  if (handle == 0) {
+    throwException(env, kNullPointerException,
+                   "close() has been called on the Session");
+    return nullptr;
+  }
+  return reinterpret_cast<TF_Session*>(handle);
+}
+
+template <class T>
+void resolveHandles(JNIEnv* env, const char* type, jlongArray src_array,
+                    T** dst, jint n) {
+  if (env->ExceptionCheck()) return;
+  jint len = env->GetArrayLength(src_array);
+  if (len != n) {
+    throwException(env, kIllegalArgumentException, "expected %d, got %d %s", n,
+                   len, type);
+    return;
+  }
+  jlong* src_start = env->GetLongArrayElements(src_array, nullptr);
+  jlong* src = src_start;
+  for (int i = 0; i < n; ++i, ++src, ++dst) {
+    if (*src == 0) {
+      throwException(env, kNullPointerException, "invalid %s (#%d of %d)", type,
+                     i, n);
+      break;
+    }
+    *dst = reinterpret_cast<T*>(*src);
+  }
+  env->ReleaseLongArrayElements(src_array, src_start, JNI_ABORT);
+}
+
+void resolveOutputs(JNIEnv* env, const char* type, jlongArray src_op,
+                    jintArray src_index, TF_Output* dst, jint n) {
+  jint len = env->GetArrayLength(src_op);
+  if (len != n) {
+    throwException(env, kIllegalArgumentException,
+                   "expected %d, got %d %s Operations", n, len, type);
+    return;
+  }
+  len = env->GetArrayLength(src_index);
+  if (len != n) {
+    throwException(env, kIllegalArgumentException,
+                   "expected %d, got %d %s Operation output indices", n, len,
+                   type);
+    return;
+  }
+  jlong* op_handles = env->GetLongArrayElements(src_op, nullptr);
+  jint* indices = env->GetIntArrayElements(src_index, nullptr);
+  for (int i = 0; i < n; ++i) {
+    if (op_handles[i] == 0) {
+      throwException(env, kNullPointerException, "invalid %s (#%d of %d)", type,
+                     i, n);
+      break;
+    }
+    dst[i] = TF_Output{reinterpret_cast<TF_Operation*>(op_handles[i]),
+                       static_cast<int>(indices[i])};
+  }
+  env->ReleaseIntArrayElements(src_index, indices, JNI_ABORT);
+  env->ReleaseLongArrayElements(src_op, op_handles, JNI_ABORT);
+}
+}  // namespace
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_allocate(
+    JNIEnv* env, jclass clazz, jlong graph_handle) {
+  if (graph_handle == 0) {
+    throwException(env, kNullPointerException, "Graph has been close()d");
+    return 0;
+  }
+  TF_Graph* graph = reinterpret_cast<TF_Graph*>(graph_handle);
+  TF_Status* status = TF_NewStatus();
+  TF_SessionOptions* opts = TF_NewSessionOptions();
+  TF_Session* session = TF_NewSession(graph, opts, status);
+  TF_DeleteSessionOptions(opts);
+  bool ok = throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+
+  return ok ? reinterpret_cast<jlong>(session) : 0;
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Session_delete(JNIEnv* env,
+                                                          jclass clazz,
+                                                          jlong handle) {
+  TF_Session* session = requireHandle(env, handle);
+  if (session == nullptr) return;
+  TF_Status* status = TF_NewStatus();
+  TF_CloseSession(session, status);
+  // Result of close is ignored, delete anyway.
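+  // (TF_CloseSession contacts any other processes associated with the
+  // session, while TF_DeleteSession frees the local session object even if
+  // the close reported an error; the same TF_Status is reused for both calls,
+  // so the check below only reflects the outcome of the delete.)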
+ TF_DeleteSession(session, status); + throwExceptionIfNotOK(env, status); + TF_DeleteStatus(status); +} + +JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run( + JNIEnv* env, jclass clazz, jlong handle, jbyteArray run_options, + jlongArray input_tensor_handles, jlongArray input_op_handles, + jintArray input_op_indices, jlongArray output_op_handles, + jintArray output_op_indices, jlongArray target_op_handles, + jboolean want_run_metadata, jlongArray output_tensor_handles) { + TF_Session* session = requireHandle(env, handle); + if (session == nullptr) return nullptr; + + // Some limitations of this function implementation that should be addressed. + if (run_options != nullptr && env->GetArrayLength(run_options) != 0) { + throwException(env, kUnsupportedOperationException, + "runOptions not supported"); + return nullptr; + } + if (want_run_metadata) { + throwException(env, kUnsupportedOperationException, + "run metadata collection not supported"); + return nullptr; + } + + const jint ninputs = env->GetArrayLength(input_tensor_handles); + const jint noutputs = env->GetArrayLength(output_tensor_handles); + const jint ntargets = env->GetArrayLength(target_op_handles); + + const TF_Buffer* crun_options = nullptr; + std::unique_ptr inputs(new TF_Output[ninputs]); + std::unique_ptr input_values(new TF_Tensor*[ninputs]); + std::unique_ptr outputs(new TF_Output[noutputs]); + std::unique_ptr output_values(new TF_Tensor*[noutputs]); + std::unique_ptr targets(new TF_Operation*[ntargets]); + TF_Buffer* run_metadata = nullptr; + + resolveHandles(env, "input Tensors", input_tensor_handles, input_values.get(), + ninputs); + resolveOutputs(env, "input", input_op_handles, input_op_indices, inputs.get(), + ninputs); + resolveOutputs(env, "output", output_op_handles, output_op_indices, + outputs.get(), noutputs); + resolveHandles(env, "target Operations", target_op_handles, targets.get(), + ntargets); + if (env->ExceptionCheck()) return nullptr; + + TF_Status* status = TF_NewStatus(); + TF_SessionRun(session, crun_options, inputs.get(), input_values.get(), + static_cast(ninputs), outputs.get(), output_values.get(), + static_cast(noutputs), targets.get(), + static_cast(ntargets), run_metadata, status); + if (!throwExceptionIfNotOK(env, status)) { + return nullptr; + } + jlong* t = env->GetLongArrayElements(output_tensor_handles, nullptr); + for (int i = 0; i < noutputs; ++i) { + t[i] = reinterpret_cast(output_values[i]); + } + env->ReleaseLongArrayElements(output_tensor_handles, t, 0); + return nullptr; +} diff --git a/tensorflow/java/src/main/native/session_jni.h b/tensorflow/java/src/main/native/session_jni.h new file mode 100644 index 00000000000000..56b8f0c2ddbef5 --- /dev/null +++ b/tensorflow/java/src/main/native/session_jni.h @@ -0,0 +1,53 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_SESSION_JNI_H_ +#define TENSORFLOW_JAVA_SESSION_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_Session + * Method: allocate + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_allocate(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Session + * Method: delete + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Session_delete(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Session + * Method: run + * Signature: (J[B[J[J[I[J[I[JZ[J)[B + */ +JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run( + JNIEnv *, jclass, jlong, jbyteArray, jlongArray, jlongArray, jintArray, + jlongArray, jintArray, jlongArray, jboolean, jlongArray); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_SESSION_JNI_H_ diff --git a/tensorflow/java/src/main/native/tensor_jni.cc b/tensorflow/java/src/main/native/tensor_jni.cc new file mode 100644 index 00000000000000..c98d6807acc345 --- /dev/null +++ b/tensorflow/java/src/main/native/tensor_jni.cc @@ -0,0 +1,435 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/java/src/main/native/tensor_jni.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" + +namespace { + +TF_Tensor* requireHandle(JNIEnv* env, jlong handle) { + if (handle == 0) { + throwException(env, kNullPointerException, + "close() was called on the Tensor"); + return nullptr; + } + return reinterpret_cast(handle); +} + +size_t elemByteSize(TF_DataType dtype) { + // The code in this file makes the assumption that the + // TensorFlow TF_DataTypes and the Java primitive types + // have the same byte sizes. Validate that: + switch (dtype) { + case TF_BOOL: + static_assert(sizeof(jboolean) == 1, + "Java boolean not compatible with TF_BOOL"); + return 1; + case TF_FLOAT: + case TF_INT32: + static_assert(sizeof(jfloat) == 4, + "Java float not compatible with TF_FLOAT"); + static_assert(sizeof(jint) == 4, "Java int not compatible with TF_INT32"); + return 4; + case TF_DOUBLE: + case TF_INT64: + static_assert(sizeof(jdouble) == 8, + "Java double not compatible with TF_DOUBLE"); + static_assert(sizeof(jlong) == 8, + "Java long not compatible with TF_INT64"); + return 8; + default: + return 0; + } +} + +// Write a Java scalar object (java.lang.Integer etc.) to a TF_Tensor. 
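+// The object is expected to box a primitive value (a java.lang.Number, or a
+// java.lang.Boolean for TF_BOOL): the matching xxxValue() accessor is invoked
+// through JNI and the primitive result is memcpy'd into the tensor buffer.
+// For example, a TF_FLOAT scalar goes through Number.floatValue() ("()F") and
+// copies sizeof(jfloat) == 4 bytes.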
+void writeScalar(JNIEnv* env, jobject src, TF_DataType dtype, void* dst, + size_t dst_size) { + size_t sz = elemByteSize(dtype); + if (sz != dst_size) { + throwException( + env, kIllegalStateException, + "scalar (%d bytes) not compatible with allocated tensor (%d bytes)", sz, + dst_size); + return; + } + switch (dtype) { +// env->FindClass and env->GetMethodID are expensive and JNI best practices +// suggest that they should be cached. However, until the creation of scalar +// valued tensors seems to become a noticeable fraction of program execution, +// ignore that cost. +#define CASE(dtype, jtype, method_name, method_signature, call_type) \ + case dtype: { \ + jclass clazz = env->FindClass("java/lang/Number"); \ + jmethodID method = env->GetMethodID(clazz, method_name, method_signature); \ + jtype v = env->Call##call_type##Method(src, method); \ + memcpy(dst, &v, sz); \ + return; \ + } + CASE(TF_FLOAT, jfloat, "floatValue", "()F", Float); + CASE(TF_DOUBLE, jdouble, "doubleValue", "()D", Double); + CASE(TF_INT32, jint, "intValue", "()I", Int); + CASE(TF_INT64, jlong, "longValue", "()J", Long); +#undef CASE + case TF_BOOL: { + jclass clazz = env->FindClass("java/lang/Boolean"); + jmethodID method = env->GetMethodID(clazz, "booleanValue", "()Z"); + jboolean v = env->CallBooleanMethod(src, method); + *(static_cast(dst)) = v ? 1 : 0; + return; + } + default: + throwException(env, kIllegalStateException, "invalid DataType(%d)", + dtype); + return; + } +} + +// Copy a 1-D array of Java primitive types to the tensor buffer dst. +// Returns the number of bytes written to dst. +size_t write1DArray(JNIEnv* env, jarray array, TF_DataType dtype, void* dst, + size_t dst_size) { + const int nelems = env->GetArrayLength(array); + jboolean is_copy; + switch (dtype) { +#define CASE(dtype, jtype, get_type) \ + case dtype: { \ + jtype##Array a = static_cast(array); \ + jtype* values = env->Get##get_type##ArrayElements(a, &is_copy); \ + size_t to_copy = nelems * elemByteSize(dtype); \ + if (to_copy > dst_size) { \ + throwException( \ + env, kIllegalStateException, \ + "cannot write Java array of %d bytes to Tensor of %d bytes", \ + to_copy, dst_size); \ + to_copy = 0; \ + } else { \ + memcpy(dst, values, to_copy); \ + } \ + env->Release##get_type##ArrayElements(a, values, JNI_ABORT); \ + return to_copy; \ + } + CASE(TF_FLOAT, jfloat, Float); + CASE(TF_DOUBLE, jdouble, Double); + CASE(TF_INT32, jint, Int); + CASE(TF_INT64, jlong, Long); + CASE(TF_BOOL, jboolean, Boolean); +#undef CASE + default: + throwException(env, kIllegalStateException, "invalid DataType(%d)", + dtype); + return 0; + } +} + +// Copy the elements of a 1-D array from the tensor buffer src to a 1-D array of +// Java primitive types. Returns the number of bytes read from src. 
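+// As in write1DArray above, the element count comes from the Java array and
+// the copy is refused if it would overrun the source buffer. The writeNDArray
+// and readNDArray helpers below recurse over the outer dimensions of a nested
+// Java array and delegate to these 1-D routines for the innermost, contiguous
+// run of elements.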
+size_t read1DArray(JNIEnv* env, TF_DataType dtype, const void* src, + size_t src_size, jarray dst) { + const int len = env->GetArrayLength(dst); + const size_t sz = len * elemByteSize(dtype); + if (sz > src_size) { + throwException( + env, kIllegalStateException, + "cannot fill a Java array of %d bytes with a Tensor of %d bytes", sz, + src_size); + return 0; + } + switch (dtype) { +#define CASE(dtype, jtype, primitive_type) \ + case dtype: { \ + jtype##Array arr = static_cast(dst); \ + env->Set##primitive_type##ArrayRegion(arr, 0, len, \ + static_cast(src)); \ + return sz; \ + } + CASE(TF_FLOAT, jfloat, Float); + CASE(TF_DOUBLE, jdouble, Double); + CASE(TF_INT32, jint, Int); + CASE(TF_INT64, jlong, Long); + CASE(TF_BOOL, jboolean, Boolean); +#undef CASE + default: + throwException(env, kIllegalStateException, "invalid DataType(%d)", + dtype); + } + return 0; +} + +size_t writeNDArray(JNIEnv* env, jarray src, TF_DataType dtype, int dims_left, + char* dst, size_t dst_size) { + if (dims_left == 1) { + return write1DArray(env, src, dtype, dst, dst_size); + } else { + jobjectArray ndarray = static_cast(src); + int len = env->GetArrayLength(ndarray); + size_t sz = 0; + for (int i = 0; i < len; ++i) { + jarray row = static_cast(env->GetObjectArrayElement(ndarray, i)); + sz += + writeNDArray(env, row, dtype, dims_left - 1, dst + sz, dst_size - sz); + env->DeleteLocalRef(row); + if (env->ExceptionCheck()) return sz; + } + return sz; + } +} + +size_t readNDArray(JNIEnv* env, TF_DataType dtype, const char* src, + size_t src_size, int dims_left, jarray dst) { + if (dims_left == 1) { + return read1DArray(env, dtype, src, src_size, dst); + } else { + jobjectArray ndarray = static_cast(dst); + int len = env->GetArrayLength(ndarray); + size_t sz = 0; + for (int i = 0; i < len; ++i) { + jarray row = static_cast(env->GetObjectArrayElement(ndarray, i)); + sz += + readNDArray(env, dtype, src + sz, src_size - sz, dims_left - 1, row); + env->DeleteLocalRef(row); + if (env->ExceptionCheck()) return sz; + } + return sz; + } +} +} // namespace + +JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_allocate(JNIEnv* env, + jclass clazz, + jint dtype, + jlongArray shape) { + size_t elem_size = elemByteSize(static_cast(dtype)); + if (elem_size == 0) { + throwException(env, kIllegalArgumentException, + "cannot allocate Tensor with DataType %d", dtype); + return 0; + } + int num_dims = static_cast(env->GetArrayLength(shape)); + jlong* dims = nullptr; + if (num_dims > 0) { + jboolean is_copy; + dims = env->GetLongArrayElements(shape, &is_copy); + } + size_t num_elems = 1; + for (int i = 0; i < num_dims; ++i) { + num_elems *= dims[i]; + } + static_assert(sizeof(jlong) == sizeof(int64_t), + "Java long is not compatible with the TensorFlow C API"); + // On some platforms "jlong" is a "long" while "int64_t" is a "long long". + // + // Thus, static_cast(dims) will trigger a compiler error: + // static_cast from 'jlong *' (aka 'long *') to 'int64_t *' (aka 'long long + // *') is not allowed + // + // Since this array is typically very small, use the guaranteed safe scheme of + // creating a copy. 
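+  // (The copy is deleted right after TF_AllocateTensor below. As a worked
+  // example, a 2x3 TF_FLOAT tensor allocates 2 * 3 * 4 = 24 bytes of data.)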
+ int64_t* dims_copy = new int64_t[num_dims]; + for (int i = 0; i < num_dims; ++i) { + dims_copy[i] = static_cast(dims[i]); + } + TF_Tensor* t = TF_AllocateTensor(static_cast(dtype), dims_copy, + num_dims, elem_size * num_elems); + delete[] dims_copy; + if (dims != nullptr) { + env->ReleaseLongArrayElements(shape, dims, JNI_ABORT); + } + if (t == nullptr) { + throwException(env, kNullPointerException, + "unable to allocate memory for the Tensor"); + return 0; + } + return reinterpret_cast(t); +} + +JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_allocateScalarBytes( + JNIEnv* env, jclass clazz, jbyteArray value) { + // TF_STRING tensors are encoded with a table of 8-byte offsets followed by + // TF_StringEncode-encoded bytes. + size_t src_len = static_cast(env->GetArrayLength(value)); + size_t dst_len = TF_StringEncodedSize(src_len); + TF_Tensor* t = TF_AllocateTensor(TF_STRING, nullptr, 0, 8 + dst_len); + char* dst = static_cast(TF_TensorData(t)); + memset(dst, 0, 8); // The offset table + + // jbyte is a signed char, while the C standard doesn't require char and + // signed char to be the same. As a result, static_cast(src) will + // complain. Copy the string instead. sigh! + jbyte* jsrc = env->GetByteArrayElements(value, nullptr); + std::unique_ptr src(new char[src_len]); + static_assert(sizeof(jbyte) == sizeof(char), + "Cannot convert Java byte to a C char"); + memcpy(src.get(), jsrc, src_len); + env->ReleaseByteArrayElements(value, jsrc, JNI_ABORT); + + TF_Status* status = TF_NewStatus(); + TF_StringEncode(src.get(), src_len, dst + 8, dst_len, status); + if (TF_GetCode(status) != TF_OK) { + // TODO(ashankar): Replace with throwExceptionIfNotOK() being added to + // exception_jni.h in another change. + throwException(env, kIllegalStateException, TF_Message(status)); + TF_DeleteStatus(status); + return 0; + } + TF_DeleteStatus(status); + return reinterpret_cast(t); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_delete(JNIEnv* env, + jclass clazz, + jlong handle) { + if (handle == 0) return; + TF_DeleteTensor(reinterpret_cast(handle)); +} + +JNIEXPORT jint JNICALL Java_org_tensorflow_Tensor_dtype(JNIEnv* env, + jclass clazz, + jlong handle) { + static_assert(sizeof(jint) >= sizeof(TF_DataType), + "TF_DataType in C cannot be represented as an int in Java"); + TF_Tensor* t = requireHandle(env, handle); + if (t == nullptr) return 0; + return static_cast(TF_TensorType(t)); +} + +JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Tensor_shape(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Tensor* t = requireHandle(env, handle); + if (t == nullptr) return nullptr; + static_assert(sizeof(jlong) == sizeof(int64_t), + "Java long is not compatible with the TensorFlow C API"); + const jsize num_dims = TF_NumDims(t); + jlongArray ret = env->NewLongArray(num_dims); + jlong* dims = env->GetLongArrayElements(ret, nullptr); + for (int i = 0; i < num_dims; ++i) { + dims[i] = static_cast(TF_Dim(t, i)); + } + env->ReleaseLongArrayElements(ret, dims, 0); + return ret; +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_setValue(JNIEnv* env, + jclass clazz, + jlong handle, + jobject value) { + TF_Tensor* t = requireHandle(env, handle); + if (t == nullptr) return; + int num_dims = TF_NumDims(t); + TF_DataType dtype = TF_TensorType(t); + void* data = TF_TensorData(t); + const size_t sz = TF_TensorByteSize(t); + if (num_dims == 0) { + writeScalar(env, value, dtype, data, sz); + } else { + writeNDArray(env, static_cast(value), dtype, num_dims, + static_cast(data), sz); + } +} + +#define 
DEFINE_GET_SCALAR_METHOD(jtype, dtype, method_suffix) \ + JNIEXPORT jtype JNICALL Java_org_tensorflow_Tensor_scalar##method_suffix( \ + JNIEnv* env, jclass clazz, jlong handle) { \ + jtype ret = 0; \ + TF_Tensor* t = requireHandle(env, handle); \ + if (t == nullptr) return ret; \ + if (TF_NumDims(t) != 0) { \ + throwException(env, kIllegalStateException, "Tensor is not a scalar"); \ + } else if (TF_TensorType(t) != dtype) { \ + throwException(env, kIllegalStateException, "Tensor is not a %s scalar", \ + #method_suffix); \ + } else { \ + memcpy(&ret, TF_TensorData(t), elemByteSize(dtype)); \ + } \ + return ret; \ + } +DEFINE_GET_SCALAR_METHOD(jfloat, TF_FLOAT, Float); +DEFINE_GET_SCALAR_METHOD(jdouble, TF_DOUBLE, Double); +DEFINE_GET_SCALAR_METHOD(jint, TF_INT32, Int); +DEFINE_GET_SCALAR_METHOD(jlong, TF_INT64, Long); +DEFINE_GET_SCALAR_METHOD(jboolean, TF_BOOL, Boolean); +#undef DEFINE_GET_SCALAR_METHOD + +JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Tensor_scalarBytes( + JNIEnv* env, jclass clazz, jlong handle) { + TF_Tensor* t = requireHandle(env, handle); + if (t == nullptr) return nullptr; + if (TF_NumDims(t) != 0) { + throwException(env, kIllegalStateException, "Tensor is not a scalar"); + return nullptr; + } + if (TF_TensorType(t) != TF_STRING) { + throwException(env, kIllegalArgumentException, + "Tensor is not a string/bytes scalar"); + return nullptr; + } + const char* data = static_cast(TF_TensorData(t)); + const char* src = data + 8; + size_t src_len = TF_TensorByteSize(t) - 8; + uint64_t offset = 0; + memcpy(&offset, data, sizeof(offset)); + if (offset >= src_len) { + throwException(env, kIllegalArgumentException, + "invalid tensor encoding: bad offsets"); + return nullptr; + } + jbyteArray ret = nullptr; + const char* dst = nullptr; + size_t dst_len = 0; + TF_Status* status = TF_NewStatus(); + TF_StringDecode(src, src_len, &dst, &dst_len, status); + if (TF_GetCode(status) != TF_OK) { + // TODO(ashankar): Replace with throwExceptionIfNotOK introduced into + // exception_jni.h by another change. + throwException(env, kIllegalArgumentException, + "invalid tensor encoding: %s", TF_Message(status)); + } else { + ret = env->NewByteArray(dst_len); + jbyte* cpy = env->GetByteArrayElements(ret, nullptr); + memcpy(cpy, dst, dst_len); + env->ReleaseByteArrayElements(ret, cpy, 0); + } + TF_DeleteStatus(status); + return ret; +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv* env, + jclass clazz, + jlong handle, + jobject value) { + TF_Tensor* t = requireHandle(env, handle); + if (t == nullptr) return; + int num_dims = TF_NumDims(t); + TF_DataType dtype = TF_TensorType(t); + const void* data = TF_TensorData(t); + const size_t sz = TF_TensorByteSize(t); + if (num_dims == 0) { + throwException(env, kIllegalArgumentException, + "copyTo() is not meant for scalar Tensors, use the scalar " + "accessor (floatValue(), intValue() etc.) instead"); + return; + } + readNDArray(env, dtype, static_cast(data), sz, num_dims, + static_cast(value)); +} diff --git a/tensorflow/java/src/main/native/tensor_jni.h b/tensorflow/java/src/main/native/tensor_jni.h new file mode 100644 index 00000000000000..ea0dfc819efdb4 --- /dev/null +++ b/tensorflow/java/src/main/native/tensor_jni.h @@ -0,0 +1,139 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_TENSOR_JNI_H_ +#define TENSORFLOW_JAVA_TENSOR_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_Tensor + * Method: allocate + * Signature: (I[J)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_allocate(JNIEnv *, jclass, + jint, jlongArray); + +/* + * Class: org_tensorflow_Tensor + * Method: allocateScalarBytes + * Signature: ([B)J + */ +JNIEXPORT jlong JNICALL +Java_org_tensorflow_Tensor_allocateScalarBytes(JNIEnv *, jclass, jbyteArray); + +/* + * Class: org_tensorflow_Tensor + * Method: delete + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_delete(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: dtype + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_tensorflow_Tensor_dtype(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: shape + * Signature: (J)[J + */ +JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Tensor_shape(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: setValue + * Signature: (JLjava/lang/Object;)V + * + * REQUIRES: The jobject's type and shape are compatible the with the DataType + * and shape of the Tensor referred to by the jlong handle. + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_setValue(JNIEnv *, jclass, + jlong, jobject); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarFloat + * Signature: (J)F + * + */ +JNIEXPORT jfloat JNICALL Java_org_tensorflow_Tensor_scalarFloat(JNIEnv *, + jclass, jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarDouble + * Signature: (J)D + */ +JNIEXPORT jdouble JNICALL Java_org_tensorflow_Tensor_scalarDouble(JNIEnv *, + jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarInt + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_tensorflow_Tensor_scalarInt(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarLong + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_scalarLong(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarBoolean + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_tensorflow_Tensor_scalarBoolean(JNIEnv *, + jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: scalarBytes + * Signature: (J)[B + */ +JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Tensor_scalarBytes(JNIEnv *, + jclass, + jlong); + +/* + * Class: org_tensorflow_Tensor + * Method: readNDArray + * Signature: (JLjava/lang/Object;)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv *, jclass, + jlong, jobject); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_TENSOR_JNI_H_ diff --git a/tensorflow/java/src/main/native/tensorflow.cc b/tensorflow/java/src/main/native/tensorflow_jni.cc similarity index 77% rename from tensorflow/java/src/main/native/tensorflow.cc rename to tensorflow/java/src/main/native/tensorflow_jni.cc index 
55de5771dd10c3..746550adbd2422 100644 --- a/tensorflow/java/src/main/native/tensorflow.cc +++ b/tensorflow/java/src/main/native/tensorflow_jni.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/java/src/main/native/tensorflow.h" +#include "tensorflow/java/src/main/native/tensorflow_jni.h" #include "tensorflow/c/c_api.h" -JNIEXPORT jstring JNICALL -Java_org_tensorflow_TensorFlow_getVersion(JNIEnv* env, jclass clazz) { +JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv* env, + jclass clazz) { return env->NewStringUTF(TF_Version()); } diff --git a/tensorflow/java/src/main/native/tensorflow.h b/tensorflow/java/src/main/native/tensorflow_jni.h similarity index 76% rename from tensorflow/java/src/main/native/tensorflow.h rename to tensorflow/java/src/main/native/tensorflow_jni.h index 897a000ac0f57b..102951c472c38d 100644 --- a/tensorflow/java/src/main/native/tensorflow.h +++ b/tensorflow/java/src/main/native/tensorflow_jni.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_JAVA_JNI_H_ -#define TENSORFLOW_JAVA_JNI_H_ +#ifndef TENSORFLOW_JAVA_TENSORFLOW_JNI_H_ +#define TENSORFLOW_JAVA_TENSORFLOW_JNI_H_ #include @@ -23,14 +23,14 @@ extern "C" { #endif // __cplusplus /* - * Class: TensorFlow - * Method: getVersion + * Class: org_tensorflow_TensorFlow + * Method: version * Signature: ()Ljava/lang/String; */ -JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_getVersion(JNIEnv*, - jclass); +JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv*, + jclass); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // TENSORFLOW_JAVA_JNI_H_ +#endif // TENSORFLOW_JAVA_TENSORFLOW_JNI_H_ diff --git a/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java new file mode 100644 index 00000000000000..fa975e55cd9c3d --- /dev/null +++ b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java @@ -0,0 +1,100 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Unit tests for {@link org.tensorflow.Graph}. */ +@RunWith(JUnit4.class) +public class GraphTest { + + @Test + public void graphDefRoundTrip() { + byte[] graphDef; + // Create a graph for A * X + B + try (Graph g = new Graph()) { + TestUtil.transpose_A_times_X(g, new int[2][2]); + graphDef = g.toGraphDef(); + } + // Import the GraphDef and find all the nodes. 
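+    // Importing with the two-argument overload prepends the given prefix to
+    // every node name, which is why the same validation helper is used once
+    // with "" and once with "BugsBunny/".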
+ try (Graph g = new Graph()) { + g.importGraphDef(graphDef); + validateImportedGraph(g, ""); + } + try (Graph g = new Graph()) { + g.importGraphDef(graphDef, "BugsBunny"); + validateImportedGraph(g, "BugsBunny/"); + } + } + + // Helper function whose implementation is based on knowledge of how + // TestUtil.transpose_A_times_X is implemented. + private void validateImportedGraph(Graph g, String prefix) { + Operation op = g.operation(prefix + "A"); + assertNotNull(op); + assertEquals(prefix + "A", op.name()); + assertEquals("Const", op.type()); + assertEquals(1, op.numOutputs()); + assertEquals(op, op.output(0).op()); + + op = g.operation(prefix + "X"); + assertNotNull(op); + assertEquals(prefix + "X", op.name()); + assertEquals("Placeholder", op.type()); + assertEquals(1, op.numOutputs()); + assertEquals(op, op.output(0).op()); + + op = g.operation(prefix + "Y"); + assertNotNull(op); + assertEquals(prefix + "Y", op.name()); + assertEquals("MatMul", op.type()); + assertEquals(1, op.numOutputs()); + assertEquals(op, op.output(0).op()); + } + + @Test + public void failImportOnInvalidGraphDefs() { + try (Graph g = new Graph()) { + try { + g.importGraphDef(null); + } catch (IllegalArgumentException e) { + // expected exception. + } + + try { + g.importGraphDef(new byte[] {1}); + } catch (IllegalArgumentException e) { + // expected exception. + } + } + } + + @Test + public void failOnUseAfterClose() { + Graph g = new Graph(); + g.close(); + try { + g.toGraphDef(); + } catch (IllegalStateException e) { + // expected exception. + } + } +} diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java new file mode 100644 index 00000000000000..b13f8306310104 --- /dev/null +++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java @@ -0,0 +1,72 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import static org.junit.Assert.fail; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Unit tests for {@link org.tensorflow.OperationBuilder}. */ +@RunWith(JUnit4.class) +public class OperationBuilderTest { + // TODO(ashankar): Restore this test once the C API gracefully handles mixing graphs and + // operations instead of segfaulting. 
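+  // Until then, the body is kept below (without the @Test annotation) so the
+  // intended coverage is not lost.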
+ // @Test + public void failWhenMixingOperationsOnDifferentGraphs() { + try (Graph g1 = new Graph(); + Graph g2 = new Graph()) { + Output c1 = TestUtil.constant(g1, "C1", 3); + Output c2 = TestUtil.constant(g2, "C2", 3); + TestUtil.addN(g1, c1, c1); + try { + TestUtil.addN(g2, c1, c2); + } catch (Exception e) { + fail(e.toString()); + } + } + } + + @Test + public void failOnUseAfterBuild() { + try (Graph g = new Graph(); + Tensor t = Tensor.create(1)) { + OperationBuilder b = + g.opBuilder("Const", "Const").setAttr("dtype", t.dataType()).setAttr("value", t); + b.build(); + try { + b.setAttr("dtype", t.dataType()); + } catch (IllegalStateException e) { + // expected exception. + } + } + } + + @Test + public void failOnUseAfterGraphClose() { + OperationBuilder b = null; + try (Graph g = new Graph(); + Tensor t = Tensor.create(1)) { + b = g.opBuilder("Const", "Const").setAttr("dtype", t.dataType()).setAttr("value", t); + } + try { + b.build(); + } catch (IllegalStateException e) { + // expected exception. + } + } +} diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java new file mode 100644 index 00000000000000..aa341a7f6a88e9 --- /dev/null +++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java @@ -0,0 +1,81 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Unit tests for {@link org.tensorflow.Session}. 
*/ +@RunWith(JUnit4.class) +public class SessionTest { + + @Test + public void run() { + try (Graph g = new Graph(); + Session s = new Session(g)) { + TestUtil.transpose_A_times_X(g, new int[][] {{2}, {3}}); + try (Tensor x = Tensor.create(new int[][] {{5}, {7}}); + AutoCloseableList outputs = + new AutoCloseableList(s.runner().feed("X", x).fetch("Y").run())) { + assertEquals(1, outputs.size()); + final int[][] expected = {{31}}; + assertEquals(expected, outputs.get(0).copyTo(new int[1][1])); + } + } + } + + @Test + public void failOnUseAfterClose() { + try (Graph g = new Graph()) { + Session s = new Session(g); + s.close(); + try { + s.runner().run(); + fail("methods on a close()d session should fail"); + } catch (IllegalStateException e) { + // expected exception + } + } + } + + private static final class AutoCloseableList extends ArrayList + implements AutoCloseable { + AutoCloseableList(Collection c) { + super(c); + } + + @Override + public void close() { + Exception toThrow = null; + for (AutoCloseable c : this) { + try { + c.close(); + } catch (Exception e) { + toThrow = e; + } + } + if (toThrow != null) { + throw new RuntimeException(toThrow); + } + } + } +} diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java index 94fd0582c1fcc8..ff89aeffbbc404 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java @@ -26,6 +26,6 @@ public class TensorFlowTest { @Test public void version() { - assertTrue(TensorFlow.getVersion().length() > 0); + assertTrue(TensorFlow.version().length() > 0); } } diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java new file mode 100644 index 00000000000000..ec1c8551a71586 --- /dev/null +++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java @@ -0,0 +1,225 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Unit tests for {@link org.tensorflow.Tensor}. 
*/ +@RunWith(JUnit4.class) +public class TensorTest { + @Test + public void scalars() { + try (Tensor t = Tensor.create(2.718f)) { + assertEquals(DataType.FLOAT, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertEquals(2.718f, t.floatValue(), 0); + } + + try (Tensor t = Tensor.create(3.1415)) { + assertEquals(DataType.DOUBLE, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertEquals(3.1415, t.doubleValue(), 0); + } + + try (Tensor t = Tensor.create(-33)) { + assertEquals(DataType.INT32, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertEquals(-33, t.intValue()); + } + + try (Tensor t = Tensor.create(8589934592L)) { + assertEquals(DataType.INT64, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertEquals(8589934592L, t.longValue()); + } + + try (Tensor t = Tensor.create(true)) { + assertEquals(DataType.BOOL, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertTrue(t.booleanValue()); + } + + final byte[] bytes = {1,2,3,4}; + try (Tensor t = Tensor.create(bytes)) { + assertEquals(DataType.STRING, t.dataType()); + assertEquals(0, t.numDimensions()); + assertEquals(0, t.shape().length); + assertArrayEquals(bytes, t.bytesValue()); + } + } + + @Test + public void nDimensional() { + double[] vector = {1.414, 2.718, 3.1415}; + try (Tensor t = Tensor.create(vector)) { + assertEquals(DataType.DOUBLE, t.dataType()); + assertEquals(1, t.numDimensions()); + assertArrayEquals(new long[] {3}, t.shape()); + + double[] got = new double[3]; + assertArrayEquals(vector, t.copyTo(got), 0); + } + + int[][] matrix = {{1, 2, 3}, {4, 5, 6}}; + try (Tensor t = Tensor.create(matrix)) { + assertEquals(DataType.INT32, t.dataType()); + assertEquals(2, t.numDimensions()); + assertArrayEquals(new long[] {2, 3}, t.shape()); + + int[][] got = new int[2][3]; + assertArrayEquals(matrix, t.copyTo(got)); + } + + long[][][] threeD = { + {{1}, {3}, {5}, {7}, {9}}, {{2}, {4}, {6}, {8}, {0}}, + }; + try (Tensor t = Tensor.create(threeD)) { + assertEquals(DataType.INT64, t.dataType()); + assertEquals(3, t.numDimensions()); + assertArrayEquals(new long[] {2, 5, 1}, t.shape()); + + long[][][] got = new long[2][5][1]; + assertArrayEquals(threeD, t.copyTo(got)); + } + + boolean[][][][] fourD = { + {{{false, false, false, true}, {false, false, true, false}}}, + {{{false, false, true, true}, {false, true, false, false}}}, + {{{false, true, false, true}, {false, true, true, false}}}, + }; + try (Tensor t = Tensor.create(fourD)) { + assertEquals(DataType.BOOL, t.dataType()); + assertEquals(4, t.numDimensions()); + assertArrayEquals(new long[] {3, 1, 2, 4}, t.shape()); + + boolean[][][][] got = new boolean[3][1][2][4]; + assertArrayEquals(fourD, t.copyTo(got)); + } + } + + @Test + public void failCreateOnMismatchedDimensions() { + int[][][] invalid = new int[3][1][]; + for (int x = 0; x < invalid.length; ++x) { + for (int y = 0; y < invalid[x].length; ++y) { + invalid[x][y] = new int[x + y + 1]; + } + } + try (Tensor t = Tensor.create(invalid)) { + fail("Tensor.create() should fail because of differing sizes in the 3rd dimension"); + } catch (IllegalArgumentException e) { + // The expected exception. 
+ } + } + + @Test + public void failCopyToOnIncompatibleDestination() { + try (final Tensor matrix = Tensor.create(new int[][] {{1, 2}, {3, 4}})) { + try { + matrix.copyTo(new int[2]); + fail("should have failed on dimension mismatch"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + + try { + matrix.copyTo(new float[2][2]); + fail("should have failed on DataType mismatch"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + + try { + matrix.copyTo(new int[2][3]); + fail("should have failed on shape mismatch"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + } + } + + @Test + public void failCopyToOnScalar() { + try (final Tensor scalar = Tensor.create(3)) { + try { + scalar.copyTo(3); + fail("copyTo should fail on scalar tensors, suggesting use of primitive accessors instead"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + } + } + + @Test + public void failOnArbitraryObject() { + try (Tensor t = Tensor.create(new Object())) { + fail("should fail on creating a Tensor with a Java object that has not equivalent DataType"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + } + + @Test + public void failOnZeroDimension() { + try (Tensor t = Tensor.create(new int[3][0][1])) { + fail("should fail on creating a Tensor where one of the dimensions is 0"); + } catch (IllegalArgumentException e) { + // The expected exception. + } + } + + @Test + public void useAfterClose() { + int n = 4; + Tensor t = Tensor.create(n); + t.close(); + try { + t.intValue(); + } catch (NullPointerException e) { + // The expected exception. + } + } + + @Test + public void fromHandle() { + // fromHandle is a package-visible method intended for use when the C TF_Tensor object has been + // created indepdently of the Java code. In practice, two Tensor instances MUST NOT have the + // same native handle. + // + // An exception is made for this test, where the pitfalls of this is avoided by not calling + // close() on both Tensors. + final float[][] matrix = {{1, 2, 3}, {4, 5, 6}}; + try (Tensor src = Tensor.create(matrix)) { + Tensor cpy = Tensor.fromHandle(src.getNativeHandle()); + assertEquals(src.dataType(), cpy.dataType()); + assertEquals(src.numDimensions(), cpy.numDimensions()); + assertArrayEquals(src.shape(), cpy.shape()); + assertArrayEquals(matrix, cpy.copyTo(new float[2][3])); + } + } +} diff --git a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java new file mode 100644 index 00000000000000..67d456202fff52 --- /dev/null +++ b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** Static utility functions. 
*/ +public class TestUtil { + public static Output constant(Graph g, String name, Object value) { + try (Tensor t = Tensor.create(value)) { + return g.opBuilder("Const", name) + .setAttr("dtype", t.dataType()) + .setAttr("value", t) + .build() + .output(0); + } + } + + public static Output placeholder(Graph g, String name, DataType dtype) { + return g.opBuilder("Placeholder", name).setAttr("dtype", dtype).build().output(0); + } + + public static Output addN(Graph g, Output... inputs) { + return g.opBuilder("AddN", "AddN").addInputList(inputs).build().output(0); + } + + public static Output matmul( + Graph g, String name, Output a, Output b, boolean transposeA, boolean transposeB) { + return g.opBuilder("MatMul", name) + .addInput(a) + .addInput(b) + .setAttr("transpose_a", transposeA) + .setAttr("transpose_b", transposeB) + .build() + .output(0); + } + + public static void transpose_A_times_X(Graph g, int[][] a) { + matmul(g, "Y", constant(g, "A", a), placeholder(g, "X", DataType.INT32), true, false); + } +} diff --git a/tensorflow/models/embedding/word2vec.py b/tensorflow/models/embedding/word2vec.py index 4b365547169cec..e2bc70c813bde9 100644 --- a/tensorflow/models/embedding/word2vec.py +++ b/tensorflow/models/embedding/word2vec.py @@ -396,7 +396,7 @@ def train(self): initial_epoch, initial_words = self._session.run([self._epoch, self._words]) - summary_op = tf.contrib.deprecated.merge_all_summaries() + summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(opts.save_path, self._session.graph) workers = [] for _ in xrange(opts.concurrent_steps): diff --git a/tensorflow/models/image/cifar10/cifar10_eval.py b/tensorflow/models/image/cifar10/cifar10_eval.py index c2329380d6fcd4..2f85051454655b 100644 --- a/tensorflow/models/image/cifar10/cifar10_eval.py +++ b/tensorflow/models/image/cifar10/cifar10_eval.py @@ -134,7 +134,7 @@ def evaluate(): saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.contrib.deprecated.merge_all_summaries() + summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) diff --git a/tensorflow/models/image/cifar10/cifar10_input.py b/tensorflow/models/image/cifar10/cifar10_input.py index b00859b2620f12..dbbd99040260fe 100644 --- a/tensorflow/models/image/cifar10/cifar10_input.py +++ b/tensorflow/models/image/cifar10/cifar10_input.py @@ -84,12 +84,14 @@ class CIFAR10Record(object): # The first bytes represent the label, which we convert from uint8->int32. result.label = tf.cast( - tf.slice(record_bytes, [0], [label_bytes]), tf.int32) + tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32) # The remaining bytes after the label represent the image, which we reshape # from [depth * height * width] to [depth, height, width]. - depth_major = tf.reshape(tf.slice(record_bytes, [label_bytes], [image_bytes]), - [result.depth, result.height, result.width]) + depth_major = tf.reshape( + tf.strided_slice(record_bytes, [label_bytes], + [label_bytes + image_bytes]), + [result.depth, result.height, result.width]) # Convert from [depth, height, width] to [height, width, depth]. 
result.uint8image = tf.transpose(depth_major, [1, 2, 0]) diff --git a/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py b/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py index a59e13d5e3b7ee..c2d1e73f8707f0 100644 --- a/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py +++ b/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py @@ -213,7 +213,7 @@ def train(): train_op = tf.group(apply_gradient_op, variables_averages_op) # Create a saver. - saver = tf.train.Saver(tf.all_variables()) + saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.contrib.deprecated.merge_summary(summaries) diff --git a/tensorflow/models/rnn/ptb/ptb_word_lm.py b/tensorflow/models/rnn/ptb/ptb_word_lm.py index 45fb1e774ab341..6ada6a6c8a3f13 100644 --- a/tensorflow/models/rnn/ptb/ptb_word_lm.py +++ b/tensorflow/models/rnn/ptb/ptb_word_lm.py @@ -108,11 +108,11 @@ def __init__(self, is_training, config, input_): # Slightly better results can be obtained with forget gate biases # initialized to 1 but the hyperparameters of the model would need to be # different than reported in the paper. - lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True) + lstm_cell = tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True) if is_training and config.keep_prob < 1: - lstm_cell = tf.nn.rnn_cell.DropoutWrapper( + lstm_cell = tf.contrib.rnn.DropoutWrapper( lstm_cell, output_keep_prob=config.keep_prob) - cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True) + cell = tf.contrib.rnn.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True) self._initial_state = cell.zero_state(batch_size, data_type()) diff --git a/tensorflow/models/rnn/ptb/reader.py b/tensorflow/models/rnn/ptb/reader.py index d9e666b3d3e137..2bcbcac5e62475 100644 --- a/tensorflow/models/rnn/ptb/reader.py +++ b/tensorflow/models/rnn/ptb/reader.py @@ -113,6 +113,8 @@ def ptb_producer(raw_data, batch_size, num_steps, name=None): epoch_size = tf.identity(epoch_size, name="epoch_size") i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() - x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps]) - y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps]) + x = tf.strided_slice(data, [0, i * num_steps], + [batch_size, (i + 1) * num_steps]) + y = tf.strided_slice(data, [0, i * num_steps + 1], + [batch_size, (i + 1) * num_steps + 1]) return x, y diff --git a/tensorflow/models/rnn/rnn_cell.py b/tensorflow/models/rnn/rnn_cell.py index 1602d3b11e83e8..47beb5e5a9be76 100644 --- a/tensorflow/models/rnn/rnn_cell.py +++ b/tensorflow/models/rnn/rnn_cell.py @@ -18,4 +18,4 @@ from __future__ import division from __future__ import print_function -raise ImportError("This module is deprecated. Use tf.nn.rnn_cell instead.") +raise ImportError("This module is deprecated. 
Use tf.contrib.rnn instead.") diff --git a/tensorflow/models/rnn/translate/seq2seq_model.py b/tensorflow/models/rnn/translate/seq2seq_model.py index 23e256c57beb56..a264b9b4bbc300 100644 --- a/tensorflow/models/rnn/translate/seq2seq_model.py +++ b/tensorflow/models/rnn/translate/seq2seq_model.py @@ -100,7 +100,7 @@ def __init__(self, b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype) output_projection = (w, b) - def sampled_loss(inputs, labels): + def sampled_loss(labels, inputs): labels = tf.reshape(labels, [-1, 1]) # We need to compute the sampled_softmax_loss using 32bit floats to # avoid numerical instabilities. @@ -114,12 +114,12 @@ def sampled_loss(inputs, labels): softmax_loss_function = sampled_loss # Create the internal multi-layer cell for our RNN. - single_cell = tf.nn.rnn_cell.GRUCell(size) + single_cell = tf.contrib.rnn.GRUCell(size) if use_lstm: - single_cell = tf.nn.rnn_cell.BasicLSTMCell(size) + single_cell = tf.contrib.rnn.BasicLSTMCell(size) cell = single_cell if num_layers > 1: - cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers) + cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers) # The seq2seq function: we use embedding for the input and attention. def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): @@ -185,7 +185,7 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): self.updates.append(opt.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)) - self.saver = tf.train.Saver(tf.all_variables()) + self.saver = tf.train.Saver(tf.global_variables()) def step(self, session, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only): diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index f1fae16bb07c77..150d554ca7e413 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -39,10 +39,12 @@ py_library( ":platform", ":platform_test", ":summary", + ":metrics", ":layers", ":training", ":ops", ":test_ops", + "//tensorflow/python/ops/losses", "//tensorflow/python/debug:debug_py", ] + if_not_windows([ "//tensorflow/contrib:contrib_py", @@ -162,6 +164,7 @@ cc_library( "//third_party/py/numpy:headers", "//util/python:python_headers", ], + alwayslink = 1, ) cc_library( @@ -746,6 +749,11 @@ tf_gen_op_wrapper_private_py( require_shape_functions = True, ) +tf_gen_op_wrapper_private_py( + name = "set_ops_gen", + require_shape_functions = True, +) + tf_gen_op_wrapper_private_py( name = "state_ops_gen", require_shape_functions = True, @@ -795,6 +803,16 @@ py_library( ], ) +py_library( + name = "sets", + srcs = ["ops/sets.py"], + srcs_version = "PY2AND3", + deps = [ + ":framework", + ":set_ops_gen", + ], +) + py_library( name = "candidate_sampling_ops", srcs = ["ops/candidate_sampling_ops.py"], @@ -1296,6 +1314,39 @@ py_library( ], ) +py_library( + name = "confusion_matrix", + srcs = ["ops/confusion_matrix.py"], + srcs_version = "PY2AND3", + deps = [ + ":array_ops", + ":control_flow_ops", + ":framework", + ":math_ops", + ":sparse_ops", + ], +) + +py_library( + name = "metrics", + srcs = ["ops/metrics.py"], + srcs_version = "PY2AND3", + deps = [ + ":array_ops", + ":check_ops", + ":confusion_matrix", + ":control_flow_ops", + ":framework", + ":math_ops", + ":nn", + ":sets", + ":sparse_ops", + ":state_ops", + ":variable_scope", + ":variables", + ], +) + py_library( name = "special_math_ops", srcs = ["ops/special_math_ops.py"], @@ -1318,6 +1369,7 @@ py_library( ":array_ops", ":check_ops", ":clip_ops", + ":confusion_matrix", ":control_flow_ops", ":data_flow_grad", 
":data_flow_ops", @@ -1516,6 +1568,7 @@ py_library( ":script_ops", ":seq2seq", ":session_ops", + ":sets", ":sparse_grad", ":sparse_ops", ":special_math_ops", @@ -2304,6 +2357,62 @@ py_test( ], ) +py_test( + name = "layers_convolutional_test", + size = "small", + srcs = [ + "layers/convolutional_test.py", + ], + main = "layers/convolutional_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers", + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "layers_conv_utils_test", + size = "small", + srcs = [ + "layers/conv_utils_test.py", + ], + main = "layers/conv_utils_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers", + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "layers_pooling_test", + size = "small", + srcs = [ + "layers/pooling_test.py", + ], + main = "layers/pooling_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers", + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "layers_normalization_test", + size = "small", + srcs = [ + "layers/normalization_test.py", + ], + main = "layers/normalization_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers", + "//tensorflow:tensorflow_py", + ], +) + py_library( name = "docs", srcs = [ diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 2a7a76c3962ee5..70ea38ffb2c88d 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -83,10 +83,12 @@ # Bring in subpackages. from tensorflow.python.layers import layers +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn -from tensorflow.python.ops import resources from tensorflow.python.ops import sdca_ops as sdca from tensorflow.python.ops import image_ops as image +from tensorflow.python.ops import losses +from tensorflow.python.ops import sets from tensorflow.python.user_ops import user_ops from tensorflow.python.util import compat from tensorflow.python.summary import summary @@ -116,12 +118,12 @@ from tensorflow.python.framework import framework_lib from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops +from tensorflow.python.ops import confusion_matrix as confusion_matrix_m from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import histogram_ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import resources from tensorflow.python.ops import script_ops from tensorflow.python.ops import session_ops from tensorflow.python.ops import sparse_ops @@ -217,12 +219,14 @@ 'graph_util', 'image', 'logging', + 'losses', + 'metrics', 'newaxis', 'nn', 'python_io', - 'resources', 'resource_loader', 'sdca', + 'sets', 'summary', 'sysconfig', 'test', @@ -242,10 +246,11 @@ # referenced in the whitelist. 
remove_undocumented(__name__, _allowed_symbols, [framework_lib, array_ops, client_lib, check_ops, - compat, constant_op, control_flow_ops, functional_ops, - histogram_ops, io_ops, math_ops, nn, resource_loader, - resources, script_ops, session_ops, sparse_ops, state_ops, - string_ops, summary, tensor_array_ops, train, layers]) + compat, constant_op, control_flow_ops, confusion_matrix_m, + functional_ops, histogram_ops, io_ops, losses, math_ops, + metrics, nn, resource_loader, sets, script_ops, + session_ops, sparse_ops, state_ops, string_ops, summary, + tensor_array_ops, train, layers]) # Special dunders that we choose to export: _exported_dunders = set([ diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i index a9cb2ff61fcfc3..51c04584a5492e 100644 --- a/tensorflow/python/client/device_lib.i +++ b/tensorflow/python/client/device_lib.i @@ -35,6 +35,8 @@ static std::vector<string> ListDevices(TF_Status* out_status) { Set_TF_Status_from_Status(out_status, status); } + std::vector<std::unique_ptr<Device>> device_holder(devices.begin(), devices.end()); + for (const Device* device : devices) { const DeviceAttributes& attr = device->attributes(); string attr_serialized; @@ -45,8 +47,6 @@ static std::vector<string> ListDevices(TF_Status* out_status) { output.clear(); return output; } - if(device->device_type() == DEVICE_SYCL) - delete device; output.push_back(attr_serialized); } diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index 591cc5afbc2f48..39d7c489f564ac 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -100,11 +100,11 @@ def _get_feeds_for_indexed_slices(feed, feed_val): # SparseTensorValues or normal tuples. (sparse_tensor.SparseTensor, lambda fetch: ( - [fetch.indices, fetch.values, fetch.shape], + [fetch.indices, fetch.values, fetch.dense_shape], lambda fetched_vals: sparse_tensor.SparseTensorValue(*fetched_vals)), lambda feed, feed_val: list(zip( [feed.indices, feed.values, feed.shape], feed_val)), - lambda feed: [feed.indices, feed.values, feed.shape]), + lambda feed: [feed.indices, feed.values, feed.dense_shape]), # IndexedSlices are fetched as IndexedSlicesValues. They can be fed # IndexedSlicesValues or normal tuples.
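For readers following the `session.py` hunk above: fetched `SparseTensor`s now surface their shape as `dense_shape` on the returned `SparseTensorValue`. The snippet below is an illustrative sketch only (it is not part of the patch, and it assumes the Python API of this TensorFlow version); it simply shows the renamed field in use:

```python
import tensorflow as tf

# Build a small SparseTensor (indices, values, shape given positionally).
sp = tf.SparseTensor([[0, 0], [1, 2]], [1.0, 2.0], [3, 4])

with tf.Session() as sess:
  # Fetching a SparseTensor yields a SparseTensorValue, per the fetch
  # converter changed above; its shape is now read via `dense_shape`.
  sp_value = sess.run(sp)
  print(sp_value.indices)      # [[0 0] [1 2]]
  print(sp_value.values)       # [ 1.  2.]
  print(sp_value.dense_shape)  # [3 4]  (previously accessed as .shape)
```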
(ops.IndexedSlices, diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 0c602a9014cfdf..bb0d0acbf58752 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -493,7 +493,7 @@ def testFetchSparseTensor(self): sp_out = s.run(sp) self.assertAllEqual(sp_out.indices, indices) self.assertAllEqual(sp_out.values, values) - self.assertAllEqual(sp_out.shape, shape) + self.assertAllEqual(sp_out.dense_shape, shape) # Tuple fetch, use as tuple indices_out, values_out, shape_out = s.run(sp) self.assertAllEqual(indices_out, indices) @@ -508,7 +508,7 @@ def testFetchSparseTensor(self): sp_out, = s.run([sp]) self.assertAllEqual(sp_out.indices, indices) self.assertAllEqual(sp_out.values, values) - self.assertAllEqual(sp_out.shape, shape) + self.assertAllEqual(sp_out.dense_shape, shape) # Dict fetch (single value), use as tuple indices_out, values_out, shape_out = s.run({'sp': sp})['sp'] self.assertAllEqual(indices_out, indices) @@ -523,7 +523,7 @@ def testFetchSparseTensor(self): sp_out = s.run({'sp': sp})['sp'] self.assertAllEqual(sp_out.indices, indices) self.assertAllEqual(sp_out.values, values) - self.assertAllEqual(sp_out.shape, shape) + self.assertAllEqual(sp_out.dense_shape, shape) # Nested list fetch use as tuple sp_out = s.run([[[sp]], sp]) indices_out, values_out, shape_out = sp_out[0][0][0] @@ -538,10 +538,10 @@ def testFetchSparseTensor(self): sp_out = s.run([[[sp]], sp]) self.assertAllEqual(sp_out[0][0][0].indices, indices) self.assertAllEqual(sp_out[0][0][0].values, values) - self.assertAllEqual(sp_out[0][0][0].shape, shape) + self.assertAllEqual(sp_out[0][0][0].dense_shape, shape) self.assertAllEqual(sp_out[1].indices, indices) self.assertAllEqual(sp_out[1].values, values) - self.assertAllEqual(sp_out[1].shape, shape) + self.assertAllEqual(sp_out[1].dense_shape, shape) def testFeedSparseTensor(self): with session.Session() as s: @@ -566,7 +566,7 @@ def testFeedSparseTensor(self): sp_out = s.run(sp, {sp: (indices, values, shape)}) self.assertAllEqual(sp_out.indices, indices) self.assertAllEqual(sp_out.values, values) - self.assertAllEqual(sp_out.shape, shape) + self.assertAllEqual(sp_out.dense_shape, shape) # Feed with SparseTensorValue indices_out, values_out, shape_out = s.run( [sp_indices, sp_values, sp_shape], @@ -579,13 +579,13 @@ def testFeedSparseTensor(self): sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)}) self.assertAllEqual(sp2_out.indices, indices) self.assertAllEqual(sp2_out.values, values) - self.assertAllEqual(sp2_out.shape, shape) + self.assertAllEqual(sp2_out.dense_shape, shape) # Feed SparseTensorValue and fetch sp directly. 
sp_out = s.run( sp, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)}) self.assertAllEqual(sp_out.indices, indices) self.assertAllEqual(sp_out.values, values) - self.assertAllEqual(sp_out.shape, shape) + self.assertAllEqual(sp_out.dense_shape, shape) def testFeedSparsePlaceholder(self): with session.Session() as s: @@ -615,7 +615,7 @@ def testFeedSparsePlaceholder(self): sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)}) self.assertAllEqual(sp2_out.indices, indices) self.assertAllEqual(sp2_out.values, values) - self.assertAllEqual(sp2_out.shape, shape) + self.assertAllEqual(sp2_out.dense_shape, shape) def testFeedSparsePlaceholderPartialShape(self): with session.Session() as s: @@ -646,7 +646,7 @@ def testFeedSparsePlaceholderPartialShape(self): sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)}) self.assertAllEqual(sp2_out.indices, indices) self.assertAllEqual(sp2_out.values, values) - self.assertAllEqual(sp2_out.shape, shape) + self.assertAllEqual(sp2_out.dense_shape, shape) def testFeedSparsePlaceholderConstantShape(self): with session.Session() as s: diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index bbe76b4945a111..150b63629bd44e 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -349,6 +349,7 @@ py_test( ], srcs_version = "PY2AND3", deps = [ + ":debugger_cli_common", ":local_cli_wrapper", "//tensorflow:tensorflow_py", "//tensorflow/python:framework_test_lib", diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py index 4e4ccd21286381..1831ea90044edb 100644 --- a/tensorflow/python/debug/cli/cli_shared.py +++ b/tensorflow/python/debug/cli/cli_shared.py @@ -71,7 +71,22 @@ def _recommend_command(command, description, indent=2): return debugger_cli_common.RichTextLines(lines, font_attr_segs=font_attr_segs) -def get_run_start_intro(run_call_count, fetches, feed_dict, tensor_filters): +def get_tfdbg_logo(): + lines = [ + "TTTTTT FFFF DDD BBBB GGG ", + " TT F D D B B G ", + " TT FFF D D BBBB G GG", + " TT F D D B B G G", + " TT F DDD BBBB GGG ", + "", + ] + return debugger_cli_common.RichTextLines(lines) + + +def get_run_start_intro(run_call_count, + fetches, + feed_dict, + tensor_filters): """Generate formatted intro for run-start UI. 
Args: @@ -101,8 +116,8 @@ def get_run_start_intro(run_call_count, fetches, feed_dict, tensor_filters): intro_lines = [ "======================================", - "About to enter Session run() call #%d:" % run_call_count, "", - "Fetch(es):" + "Session.run() call #%d:" % run_call_count, + "", "Fetch(es):" ] intro_lines.extend([" " + line for line in fetch_lines]) intro_lines.extend(["", "Feed dict(s):"]) @@ -120,11 +135,16 @@ def get_run_start_intro(run_call_count, fetches, feed_dict, tensor_filters): out.extend( _recommend_command( "run -n", "Execute the run() call without debug tensor-watching")) + out.extend( + _recommend_command( + "run -t <T>", + "Execute run() calls (T - 1) times without debugging, then " + "execute run() one more time and drop back to the CLI")) out.extend( _recommend_command( "run -f <filter_name>", "Keep executing run() calls until a dumped tensor passes a given, " - "registered filter (conditional breakpoint mode).")) + "registered filter (conditional breakpoint mode)")) more_font_attr_segs = {} more_lines = [" Registered filter(s):"] diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py index dbcb2827afd0c9..0976d6e3d6fe41 100644 --- a/tensorflow/python/debug/cli/cli_shared_test.py +++ b/tensorflow/python/debug/cli/cli_shared_test.py @@ -42,8 +42,7 @@ def testSingleFetchNoFeeds(self): run_start_intro = cli_shared.get_run_start_intro(12, self.const_a, None, {}) # Verify line about run() call number. - self.assertEqual("About to enter Session run() call #12:", - run_start_intro.lines[1]) + self.assertTrue(run_start_intro.lines[1].endswith("run() call #12:")) # Verify line about fetch. const_a_name_line = run_start_intro.lines[4] @@ -58,8 +57,10 @@ def testSingleFetchNoFeeds(self): self.assertEqual([(2, 5, "bold")], run_start_intro.font_attr_segs[11]) self.assertEqual("run -n:", run_start_intro.lines[13][2:]) self.assertEqual([(2, 8, "bold")], run_start_intro.font_attr_segs[13]) - self.assertEqual("run -f <filter_name>:", run_start_intro.lines[15][2:]) - self.assertEqual([(2, 22, "bold")], run_start_intro.font_attr_segs[15]) + self.assertEqual("run -t <T>:", run_start_intro.lines[15][2:]) + self.assertEqual([(2, 12, "bold")], run_start_intro.font_attr_segs[15]) + self.assertEqual("run -f <filter_name>:", run_start_intro.lines[17][2:]) + self.assertEqual([(2, 22, "bold")], run_start_intro.font_attr_segs[17]) # Verify short description. description = cli_shared.get_run_short_description(12, self.const_a, None) @@ -179,8 +180,8 @@ def testTensorFilters(self): # Verify the listed names of the tensor filters.
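The recommendations added above point users at `run -f <filter_name>`. As background, a tensor filter is a plain Python callable over a debug datum and the dumped tensor value that returns a boolean. The sketch below is illustrative only: the `has_large_values` name is invented here, the `(datum, tensor)` signature follows the tfdbg filter convention, and the registration call is the `add_tensor_filter` method of the session wrapper that appears later in this patch.

```python
import numpy as np

def has_large_values(datum, tensor):
  """Example tensor filter: fire when any element exceeds 1e3 in magnitude.

  Args:
    datum: metadata about the dumped tensor (node name, output slot, etc.).
    tensor: the dumped value as a numpy array (may not be an ndarray for
      uninitialized tensors, hence the isinstance check).
  """
  del datum  # Unused in this simple filter.
  return isinstance(tensor, np.ndarray) and bool((np.abs(tensor) > 1e3).any())

# Registered on a wrapped session, e.g.
#   sess.add_tensor_filter("has_large_values", has_large_values)
# the filter can then be triggered from the CLI with: run -f has_large_values
```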
filter_names = set() - filter_names.add(run_start_intro.lines[18].split(" ")[-1]) - filter_names.add(run_start_intro.lines[19].split(" ")[-1]) + filter_names.add(run_start_intro.lines[20].split(" ")[-1]) + filter_names.add(run_start_intro.lines[21].split(" ")[-1]) self.assertEqual({"filter_a", "filter_b"}, filter_names) @@ -218,14 +219,14 @@ def testShapeError(self): self.assertEqual(2, error_intro.lines[8].index("lt")) self.assertEqual([(2, 4, "bold")], error_intro.font_attr_segs[8]) - self.assertTrue(error_intro.lines[11].startswith("Op name:")) + self.assertStartsWith(error_intro.lines[11], "Op name:") self.assertTrue(error_intro.lines[11].endswith("a/Assign")) - self.assertTrue(error_intro.lines[12].startswith("Error type:")) + self.assertStartsWith(error_intro.lines[12], "Error type:") self.assertTrue(error_intro.lines[12].endswith(str(type(tf_error)))) self.assertEqual("Details:", error_intro.lines[14]) - self.assertTrue(error_intro.lines[15].startswith("foo description")) + self.assertStartsWith(error_intro.lines[15], "foo description") if __name__ == "__main__": diff --git a/tensorflow/python/debug/examples/README.md b/tensorflow/python/debug/examples/README.md index b5cf855c663c9b..04f756dbfd32bd 100644 --- a/tensorflow/python/debug/examples/README.md +++ b/tensorflow/python/debug/examples/README.md @@ -1,12 +1,16 @@ # TensorFlow Debugger (tfdbg) Command-Line-Interface Tutorial: MNIST -**(Under development, subject to change)** +**(Experimental)** -This tutorial showcases the features of TensorFlow Debugger (**tfdbg**) -command-line interface. -It contains an example of how to debug a frequently encountered problem in -TensorFlow model development: bad numerical values (`nan`s and `inf`s) causing -training to fail. +TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It +provides visibility into the internal structure and states of running +TensorFlow graphs. The insight gained from this visibility should facilitate +debugging of various types of model bugs during training and inference. + +This tutorial showcases the features of the tfdbg +command-line interface (CLI) by focusing on how to debug a +frequently encountered type of bug in TensorFlow model development: +bad numerical values (`nan`s and `inf`s) causing training to fail. To **observe** such an issue, run the following code without the debugger: @@ -25,11 +29,7 @@ Accuracy at step 1: 0.3183 Accuracy at step 2: 0.098 Accuracy at step 3: 0.098 Accuracy at step 4: 0.098 -Accuracy at step 5: 0.098 -Accuracy at step 6: 0.098 -Accuracy at step 7: 0.098 -Accuracy at step 8: 0.098 -Accuracy at step 9: 0.098 +... ``` Scratching your head, you suspect that certain nodes in the training graph @@ -122,7 +122,9 @@ output. As the screen output indicates, the first `run()` call calculates the accuracy using a test data set—i.e., a forward pass on the graph. You can enter the -command `run` to launch the `run()` call. This will bring up another screen +command `run` (or its shorthand `r`) to launch the `run()` call. + +This will bring up another screen right after the `run()` call has ended, which will display all dumped intermediate tensors from the run. (These tensors can also be obtained by running the command `lt` after you have executed `run`.) This is called the @@ -167,27 +169,21 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at | `lo -r hidden/Relu:0` | List the recipients of the output of the node `hidden/Relu`, recursively—i.e., the output recipient tree.
| | `lt -n softmax.*` | List all dumped tensors whose names match the regular-expression pattern `softmax.*`. | | `lt -t MatMul` | List all dumped tensors whose node type is `MatMul`. | +| `run_info` or `ri` | Display information about the current run, including fetches and feeds. | | `help` | Print general help information listing all available **tfdbg** commands and their flags. | | `help lt` | Print the help information for the `lt` command. | In this first `run()` call, there happen to be no problematic numerical values. -You can exit the run-end UI by entering the command `exit`. Then you will be at -the second run-start UI: +You can move on to the next run by using the command `run` or its shorthand `r`. -```none ---- run-start: run #2: fetch: train/Adam; 2 feeds -------------- -====================================== -About to enter Session run() call #2: - -Fetch(es): - train/Adam - -Feed dict(s): - input/x-input:0 - input/y-input:0 -====================================== -... -``` +> TIP: If you enter `run` or `r` repeatedly, you will be able to move through the +> `run()` calls in a sequential manner. +> +> You can also use the `-t` flag to move ahead a number of `run()` calls at a time, for example: +> +> ``` +> tfdbg> run -t 10 +> ``` Instead of entering `run` repeatedly and manually searching for `nan`s and `inf`s in the run-end UI after every `run()` call, you can use the following diff --git a/tensorflow/python/debug/stepper.py b/tensorflow/python/debug/stepper.py index 0a6b5510abe273..d20be2a829d22d 100644 --- a/tensorflow/python/debug/stepper.py +++ b/tensorflow/python/debug/stepper.py @@ -17,12 +17,39 @@ from __future__ import division from __future__ import print_function +import copy + from tensorflow.core.protobuf import config_pb2 from tensorflow.python.debug import debug_data from tensorflow.python.framework import ops from tensorflow.python.ops import session_ops +# TODO(cais): Use nest.flatten once it handles nest Dicts correctly. +def _flatten_fetches(fetches): + """Flatten list, tuple of fetches, or a single fetch into a list of fetches. + + Args: + fetches: The fetches to flatten: Can be a single Tensor, Op, or a + potentially nested list, tuple or dict of such individual fetches. + + Returns: + The fetches flattened to a list. + """ + + flattened = [] + if isinstance(fetches, (list, tuple)): + for fetch in fetches: + flattened.extend(_flatten_fetches(fetch)) + elif isinstance(fetches, dict): + for key in fetches: + flattened.extend(_flatten_fetches(fetches[key])) + else: + flattened.append(fetches) + + return flattened + + class NodeStepper(object): """TensorFlow Debugger (tfdbg) stepper. @@ -71,33 +98,22 @@ class NodeStepper(object): # the stepper is capable of using dumped intermediate tensors. FEED_TYPE_INTERMEDIATE = "intermediate" - def __init__(self, sess, fetch, feed_dict=None): + def __init__(self, sess, fetches, feed_dict=None): """Constructor for Debugger. Args: sess: (Session) the TensorFlow Session to step in. - fetch: (str or TensorFlow graph element) A single fetched Tensor or Op, - or a name (str) representing the Tensor or Op. In the case of a name - str, the graph will be searched to find the corresponding Tensor or Op. - feed_dict: (dict or None) feed dict to be used in this stepper instance. - - TODO(cais): Currently the stepper supports a single fetch. Support list, - tuple or dict of feeds, as in the Session run() interface. + fetches: Same as the fetches input argument to `Session.run()`. 
+ feed_dict: Same as the feed_dict input argument to `Session.run()`. """ self._sess = sess - if isinstance(fetch, str): - # Fetch target is a string. Assume it is the name of the Tensor or Op and - # will attempt to find it in the Session's graph. - self._fetch_name = fetch - elif isinstance(fetch, list) or isinstance(fetch, tuple) or isinstance( - fetch, dict): - raise NotImplementedError( - "list, tuple or dict fetches are not supported yet.") - else: - self._fetch_name = fetch.name - self._fetch = self._sess.graph.as_graph_element(self._fetch_name) + self._fetches = fetches + flattened_fetches = _flatten_fetches(fetches) + + self._fetch_names, self._fetch_list = self._get_fetch_and_name_lists( + flattened_fetches) # A map from Variable name to initializer op. self._variable_initializers = {} @@ -110,9 +126,10 @@ def __init__(self, sess, fetch, feed_dict=None): self._non_control_output_targets = {} # Sorted transitive closure of the fetched node. - self._sorted_transitive_closure = self._dfs_visit(self._sess.graph, - self._fetch) - self._transitive_closure_set = set(self._sorted_transitive_closure) + self._sorted_nodes, self._closure_elements = self._dfs_visit( + self._sess.graph, self._fetch_list) + + self._transitive_closure_set = set(self._sorted_nodes) # A map from Variable name to the old values (before any cont() calls). self._cached_variable_values = {} @@ -128,9 +145,13 @@ def __init__(self, sess, fetch, feed_dict=None): self._tensor_handles = {} # Feed dict from the client. - self._client_feed_dict = feed_dict - if not self._client_feed_dict: - self._client_feed_dict = {} + self._client_feed_dict = {} + if feed_dict: + for key in feed_dict: + if isinstance(key, ops.Tensor): + self._client_feed_dict[key.name] = feed_dict[key] + else: + self._client_feed_dict[key] = feed_dict[key] # Overriding tensor values. self._override_tensors = {} @@ -138,7 +159,31 @@ def __init__(self, sess, fetch, feed_dict=None): # What the feed types were used by the last cont() call. self._last_feed_types = {} - def _dfs_visit(self, graph, elem): + def _get_fetch_and_name_lists(self, flattened_fetches): + """Get the lists of fetches and their names. + + Args: + flattened_fetches: A list of fetches or their names. Can mix fetches and + names. + + Returns: + (list of str): A list of the names of the fetches. + (list): A list of the fetches. + """ + + fetch_names = [] + fetch_list = [] + for fetch in flattened_fetches: + if isinstance(fetch, str): + fetch_names.append(fetch) + fetch_list.append(self._sess.graph.as_graph_element(fetch)) + else: + fetch_names.append(fetch.name) + fetch_list.append(fetch) + + return fetch_names, fetch_list + + def _dfs_visit(self, graph, elem_list): """Trace back the input of a graph element, using depth-first search. Uses non-recursive implementation to prevent stack overflow for deep @@ -151,22 +196,27 @@ def _dfs_visit(self, graph, elem): Args: graph: A TF graph instance. - elem: A graph element: a Tensor or an Operation. + elem_list: list of graph elements: a Tensor or an Operation. Returns: - (list of str) A topologically-sorted list of all graph element names - in the transitive closure of elem. Obviously, the topological sort is - not unique in general. The return value here is just an arbitrary one - of potentially many possible topological sorts. + (list of str) A topologically-sorted list of all nodes (not tensors) + in the transitive closure of elem_list. Obviously, the topological sort + is not unique in general. 
The return value here is just an arbitrary + one of potentially many possible topological sorts. + (list of str) A list of all graph elements (nodes and/or tensors) in the + transitive closure. """ # These set should hold only strings, i.e, names of the nodes. - done = set() # Keep track of visited nodes. + done = set() # Keep track of visited graph elements. # A list of str: Names of the topologically-sorted graph elements. - sorted_node_list = [elem.name] + node_inputs = dict() # New: Input map of nodes in the transitive closure. - elem_stack = [elem] + elem_stack = copy.copy(elem_list) + + # Graph elements in the transitive closure, including the nodes and tensors. + closure_elements = [elem.name for elem in elem_list] while elem_stack: curr_elem = elem_stack.pop() @@ -178,6 +228,12 @@ def _dfs_visit(self, graph, elem): control_inputs = [inp for inp in curr_node.control_inputs] all_inputs = set(non_control_inputs + control_inputs) + if curr_node.name not in node_inputs: + all_input_nodes = set() + for inp in all_inputs: + all_input_nodes.add(self._get_node(inp).name) + node_inputs[curr_node.name] = all_input_nodes + # Iterate through the (non-control) inputs. for inp in all_inputs: is_non_control_input = inp in non_control_inputs @@ -189,7 +245,7 @@ def _dfs_visit(self, graph, elem): else: self._non_control_output_targets[inp.name].add(curr_elem.name) - if (inp.op.type == "Variable" and + if (inp.op.type in ["Variable", "VariableV2"] and inp.name not in self._variable_initializers): # Obtain the initializer op of the variable, in case the Variable's # value needs to be restored later. @@ -203,20 +259,81 @@ def _dfs_visit(self, graph, elem): continue elem_stack.append(inp) - sorted_node_list.append(inp.name) + closure_elements.append(inp.name) + + # Now that we have traversed the transitive closure and obtained the + # node-input map, we can topologically sort them. + sorted_nodes = [] + stack = [] + for node in node_inputs: + if not node_inputs[node]: + stack.append(node) + for node in stack: + del node_inputs[node] + + while stack: + curr_node = stack.pop() + sorted_nodes.append(curr_node) + + # Iterate through the node-input map and remove the child. + pushes = [] + for node in node_inputs: + if curr_node in node_inputs[node]: + node_inputs[node].remove(curr_node) + if not node_inputs[node]: + pushes.append(node) + + # Delete new pushes from node-input map. + for node in pushes: + del node_inputs[node] + + stack.extend(pushes) - sorted_node_list.reverse() - return sorted_node_list + return sorted_nodes, closure_elements - def sorted_transitive_closure(self): - """Get a sorted list of transitive inputs to the fetch of the stepper. + def sorted_nodes(self): + """Get a topologically-sorted list of node names of the stepper. + + These are the names of the nodes (i.e., not Tensors) in the transitive + closure of the stepper, in a topologically-sorted order. Returns: (list of str): Sorted transitive inputs to the fetch of the stepper instance. The fetch itself is included in the list. """ - return self._sorted_transitive_closure + return self._sorted_nodes + + def closure_elements(self): + """Get a name list of the graph elements of the stepper. + + Returns: + (list of str): names of the graph elements (i.e., nodes and tensors) in + the transitive closure of the stepper, in a random order. + """ + + return self._closure_elements + + def output_slots_in_closure(self, node_name): + """Get the output tensors in the transitive closure from node. 
+ + Args: + node_name: (str) Name of the node in question. + + Returns: + (list of int) Output slots of the output tensors of the node that are in + the transitive closure of the stepper. + """ + + node = self._sess.graph.as_graph_element(node_name) + + tensor_slots = [] + for i, _ in enumerate(node.outputs): + tensor_name = node_name + ":%d" % i + if tensor_name in self._closure_elements: + tensor_slots.append(i) + + return tensor_slots def is_feedable(self, name): """Determine if a graph element if feedable. @@ -249,10 +366,12 @@ def override_tensor(self, tensor_name, overriding_val): if not isinstance(tensor_name, str): raise TypeError("Expected type str; got type %s" % type(tensor_name)) - if tensor_name not in self._transitive_closure_set: + node_name = self._get_node_name(tensor_name) + if node_name not in self._transitive_closure_set: raise ValueError( "Cannot override tensor \"%s\" because it does not exist in the " - "input tree to the fetch \"%s\"" % (tensor_name, self._fetch_name)) + "input tree to the fetch \"%s\"" % + (tensor_name, repr(self._fetch_names))) self._override_tensors[tensor_name] = overriding_val @@ -320,6 +439,9 @@ def cont(self, self._last_feed_types = {} + # The feeds to be used in the Session.run() call. + feeds = {} + if isinstance(target, str): # Fetch target is a string. Assume it is the name of the Tensor or Op and # will attempt to find it in the Session's graph. @@ -333,14 +455,20 @@ def cont(self, if (isinstance(graph_element, ops.Tensor) and graph_element.op.type == "Placeholder"): - raise ValueError("Should not call cont() on a Placeholder") + self._last_feed_types[graph_element.name] = self.FEED_TYPE_CLIENT + return self._client_feed_dict[graph_element.name] + elif (isinstance(graph_element, ops.Operation) and + graph_element.type == "Placeholder"): + tensor_name = graph_element.name + ":0" + self._last_feed_types[tensor_name] = self.FEED_TYPE_CLIENT + return self._client_feed_dict[tensor_name] if isinstance(graph_element, ops.Operation) and graph_element.outputs: # Check if this op has any output tensors that also fall into this # stepper's transitive closure. node_outputs = [ output.name for output in graph_element.outputs - if output.name in self.sorted_transitive_closure() + if output.name in self._closure_elements ] if node_outputs: # The target is an op with at least one output within the transitive @@ -353,10 +481,11 @@ def cont(self, # Verify that the target is in the transitive closure of the stepper's # fetch. - if target_name not in self._transitive_closure_set: + target_node_name = self._get_node_name(target_name) + if target_node_name not in self._transitive_closure_set: raise ValueError( "Target \"%s\" is not in the transitive closure for the fetch of the " - "stepper: \"%s\"." % (target_name, self._fetch_name)) + "stepper: \"%s\"." % (target_name, repr(self._fetch_names))) # Check if a cached tensor handle can be used on the fetch directly. if use_tensor_handles and target_name in self._tensor_handles: @@ -369,9 +498,6 @@ def cont(self, self._last_feed_types[target_name] = self.FEED_TYPE_OVERRIDE return self._override_tensors[target_name] - # The feeds to be used in the Session.run() call. - feeds = {} - # Keep track of which variables are restored in this cont() call. restored_variables = set() @@ -427,7 +553,7 @@ def cont(self, # the recipient node is not Identity. In that case, the Variable # needs to be marked as dirty and its current value recorded, due to # the fact that the receiving op may mutate the value of the Variable. 
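The `_dfs_visit` changes earlier in this hunk first collect `node_inputs`, a map from each node in the transitive closure to the names of its input nodes, and then topologically sort it by repeatedly popping nodes whose input sets have become empty (Kahn's algorithm). A standalone sketch of that sorting step, with illustrative names only and not part of the patch, may make the stack-based loop easier to follow:

```python
def topological_sort(node_inputs):
  """Order node names so that every node appears after all of its inputs.

  Args:
    node_inputs: dict mapping node name -> set of names of its input nodes,
      restricted to nodes inside the transitive closure (as in _dfs_visit).
  """
  remaining = {node: set(inputs) for node, inputs in node_inputs.items()}

  # Seed the stack with nodes that have no inputs at all.
  stack = [node for node, inputs in remaining.items() if not inputs]
  for node in stack:
    del remaining[node]

  sorted_nodes = []
  while stack:
    curr = stack.pop()
    sorted_nodes.append(curr)

    # Remove curr from every remaining node's input set; nodes whose input
    # sets become empty are ready to be emitted next.
    ready = []
    for node, inputs in remaining.items():
      inputs.discard(curr)
      if not inputs:
        ready.append(node)
    for node in ready:
      del remaining[node]
    stack.extend(ready)

  return sorted_nodes


# Example with a "cross-link" (a feeds both b and c, and b feeds c):
print(topological_sort({"a": set(), "b": {"a"}, "c": {"a", "b"}}))
# ['a', 'b', 'c']
```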
- if (is_inp_ref and inp.op.type == "Variable" and + if (is_inp_ref and inp.op.type in ["Variable", "VariableV2"] and curr_node.type != "Identity"): # Mark the variable as dirty. touched_variables.add(inp.name) @@ -451,9 +577,9 @@ def cont(self, # Tensor handle found in cache. feeds[inp] = self._tensor_handles[inp.name].eval() self._last_feed_types[inp.name] = self.FEED_TYPE_HANDLE - elif inp in self._client_feed_dict: + elif inp.name in self._client_feed_dict: # This input is available in the client feed_dict. - feeds[inp] = self._client_feed_dict[inp] + feeds[inp] = self._client_feed_dict[inp.name] self._last_feed_types[inp.name] = self.FEED_TYPE_CLIENT else: # There is no feed available for this input. So keep tracing its @@ -506,6 +632,9 @@ def cont(self, for touched_variable in touched_variables: self._invalidate_transitively_outgoing_cache(touched_variable) + def _get_node_name(self, graph_element_name): + return graph_element_name.split(":")[0] + def _invalidate_transitively_outgoing_cache(self, source_element): """Invalidate the cached tensor handles by tracing output. @@ -528,10 +657,14 @@ def _invalidate_transitively_outgoing_cache(self, source_element): # First, use cached invalidation paths to eliminate some cached tensor # handles. + to_delete = [] for handle_name in self._tensor_handles: if (handle_name in self._cached_invalidation_path and source_element in self._cached_invalidation_path[handle_name]): - del self._tensor_handles[handle_name] + to_delete.append(handle_name) + + for handle_name in to_delete: + del self._tensor_handles[handle_name] if not self._tensor_handles: return @@ -570,11 +703,15 @@ def finalize(self): The same return value as self.cont() as called on the final fetch. """ - return self.cont( - self._fetch, - use_tensor_handles=False, - use_overrides=False, - restore_variable_values=True) + # Restore variable to their previous values. + for var_name in self._cached_variable_values: + self._sess.run(self._variable_initializers[var_name], + feed_dict={ + self._variable_initial_values[var_name]: + self._cached_variable_values[var_name] + }) + + return self._sess.run(self._fetches, feed_dict=self._client_feed_dict) def handle_names(self): """Return names of the TensorHandles that the debugger is holding. @@ -582,8 +719,18 @@ def handle_names(self): Returns: (list of str) Name of the tensors for which TensorHandle is available. """ + return [name for name in self._tensor_handles] + def handle_node_names(self): + """Get list of names of the nodes for which handles are available. + + Returns: + (set of str) List of names of the nodes. + """ + + return set([self._get_node_name(name) for name in self._tensor_handles]) + def dirty_variables(self): """Get the set of variables that are currently "dirty". @@ -598,6 +745,47 @@ def dirty_variables(self): return self._dirty_variables + def is_placeholder(self, graph_element_name): + """Check whether a graph element is a Placeholder, by name. + + Args: + graph_element_name: (str) Name of the tensor or op to be tested. + + Returns: + (bool) Whether the graph element of the specified name is a Placeholder + op or the output Tensor of a Placeholder op. + + Raises: + ValueError: If graph_element_name is not in the transitive closure of the + stepper instance. 
+ """ + + node_name = self._get_node_name(graph_element_name) + if node_name not in self.sorted_nodes(): + raise ValueError( + "%s is not in the transitive closure of this NodeStepper " + "instance" % graph_element_name) + + graph_element = self._sess.graph.as_graph_element(graph_element_name) + if not isinstance(graph_element, ops.Operation): + graph_element = graph_element.op + return graph_element.type == "Placeholder" + + def placeholders(self): + """Get the list of Placeholder Tensors in the transitive closure. + + Returns: + (list of str) A list of Placeholder Tensors or ops in the transitive + closure. + """ + + placeholders = [] + for item in self.sorted_nodes(): + if self.is_placeholder(item): + placeholders.append(item) + + return placeholders + def get_tensor_value(self, tensor_name): """Get the value of a tensor that the stepper has access to. @@ -621,9 +809,6 @@ def get_tensor_value(self, tensor_name): "This stepper instance does not have access to the value of " "tensor \"%s\"" % tensor_name) - def get_fetch_result(self): - return self.get_tensor_value(self._fetch_name) - def override_names(self): """Return names of the TensorHandles that the debugger is holding. diff --git a/tensorflow/python/debug/stepper_test.py b/tensorflow/python/debug/stepper_test.py index 9a98dbfd21707c..67a4ef35cdb77d 100644 --- a/tensorflow/python/debug/stepper_test.py +++ b/tensorflow/python/debug/stepper_test.py @@ -35,7 +35,18 @@ def setUp(self): self.e = tf.mul(self.d, self.c, name="e") # Should be 24.0. - self.f = tf.div(self.b, 0.30, name="f") # Should be 20.0. + self.f_y = tf.constant(0.30, name="f_y") + self.f = tf.div(self.b, self.f_y, name="f") # Should be 10.0. + + # The there nodes x, y and z form a graph with "cross-links" in. I.e., x + # and y are both direct inputs to z, but x is also a direct input to y. + self.x = tf.Variable(2.0, name="x") # Should be 2.0 + self.y = tf.neg(self.x, name="y") # Should be -2.0. + + self.z = tf.mul(self.x, self.y, name="z") # Should be -4.0. 
+ + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) @@ -43,19 +54,35 @@ def setUp(self): def tearDown(self): tf.reset_default_graph() - def testAttemptToContToFetchNotInTransitiveClosure(self): + def testContToFetchNotInTransitiveClosureShouldError(self): stepper = NodeStepper(self.sess, "e:0") - self.assertEqual( - ["a:0", "b:0", "b/read:0", "a/read:0", "c:0", "d:0", "e:0"], - stepper.sorted_transitive_closure()) + sorted_nodes = stepper.sorted_nodes() + self.assertEqual(7, len(sorted_nodes)) + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("a/read")) + self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("b/read")) + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("c")) + self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("c")) + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("d")) + self.assertLess(sorted_nodes.index("d"), sorted_nodes.index("e")) + self.assertLess(sorted_nodes.index("c"), sorted_nodes.index("e")) + + self.assertSetEqual( + {"e:0", "d:0", "c:0", "a/read:0", "b/read:0", "b:0", "a:0"}, + set(stepper.closure_elements())) with self.assertRaisesRegexp( ValueError, "Target \"f:0\" is not in the transitive closure for the fetch of the " - "stepper: \"e:0\""): + "stepper"): stepper.cont("f:0") + def testContToNodeNameShouldReturnTensorvalue(self): + stepper = NodeStepper(self.sess, "e:0") + + cont_result = stepper.cont("c") + self.assertAllClose(6.0, cont_result) + def testUsingNamesNotUsingIntermediateTensors(self): stepper = NodeStepper(self.sess, "e:0") @@ -77,6 +104,7 @@ def testUsingNodesNotUsingIntermediateTensors(self): # There should be no handles before any cont() calls. self.assertEqual([], stepper.handle_names()) + self.assertSetEqual(set(), stepper.handle_node_names()) # Before the cont() call, the stepper should not have access to the value # of c:0. @@ -93,6 +121,7 @@ def testUsingNodesNotUsingIntermediateTensors(self): self.assertEqual({}, stepper.last_feed_types()) self.assertEqual(["c:0"], stepper.handle_names()) + self.assertEqual({"c"}, stepper.handle_node_names()) # After the cont() call, the stepper should have access to the value of c:0 # via a tensor handle. @@ -104,7 +133,7 @@ def testUsingNodesNotUsingIntermediateTensors(self): "c:0": NodeStepper.FEED_TYPE_HANDLE }, stepper.last_feed_types()) - def testIsFeedable(self): + def testIsFeedableShouldGiveCorrectAnswers(self): stepper = NodeStepper(self.sess, self.e) self.assertTrue(stepper.is_feedable("a/read:0")) @@ -139,6 +168,7 @@ def testOverrideValue(self): # Now c:0 should have only an override value, but no cached handle, because # the handle should have been invalidated. self.assertEqual([], stepper.handle_names()) + self.assertSetEqual(set(), stepper.handle_node_names()) self.assertEqual(["c:0"], stepper.override_names()) # Run a downstream tensor after the value override. @@ -161,6 +191,7 @@ def testOverrideValueTwice(self): }, stepper.last_feed_types()) self.assertEqual(["e:0"], stepper.handle_names()) + self.assertSetEqual({"e"}, stepper.handle_node_names()) self.assertEqual(["c:0"], stepper.override_names()) # Calling cont(self.e) again. 
This time the cached tensor handle of e @@ -174,6 +205,7 @@ def testOverrideValueTwice(self): stepper.override_tensor("c:0", 8.0) self.assertEqual([], stepper.handle_names()) + self.assertEqual(set(), stepper.handle_node_names()) self.assertEqual(["c:0"], stepper.override_names()) self.assertAllClose(32.0, stepper.cont(self.e)) @@ -190,12 +222,14 @@ def testRemoveOverrideValue(self): # The previous cont() step should have generated a cached tensor handle. self.assertEqual(["c:0"], stepper.handle_names()) + self.assertSetEqual({"c"}, stepper.handle_node_names()) # Override c:0. stepper.override_tensor("c:0", 7.0) # The overriding should have invalidated the tensor handle. self.assertEqual([], stepper.handle_names()) + self.assertSetEqual(set(), stepper.handle_node_names()) self.assertEqual(["c:0"], stepper.override_names()) result = stepper.cont(self.e) @@ -207,6 +241,7 @@ def testRemoveOverrideValue(self): # The handle to tensor e:0 should have been cached, even though its # transitive closure contains an override. self.assertIn("e:0", stepper.handle_names()) + self.assertSetEqual({"e"}, stepper.handle_node_names()) # Remove the override. stepper.remove_override("c:0") @@ -215,6 +250,7 @@ def testRemoveOverrideValue(self): # Removing the override should have invalidated the tensor handle for c. self.assertNotIn("e:0", stepper.handle_names()) + self.assertNotIn("e", stepper.handle_node_names()) # Should reflect the non-overriding value. self.assertAllClose(24.0, stepper.cont(self.e)) @@ -222,6 +258,7 @@ def testRemoveOverrideValue(self): # This time, the handle to tensor e:0 should have been cached again, even # thought its transitive closure contains an override. self.assertIn("e:0", stepper.handle_names()) + self.assertIn("e", stepper.handle_node_names()) # Calling cont(self.e) again should have used the tensor handle to e:0. self.assertAllClose(24.0, stepper.cont(self.e)) @@ -236,6 +273,7 @@ def testOverrideAndContToSameTensor(self): self.assertAllClose(6.0, result) self.assertEqual({}, stepper.last_feed_types()) self.assertEqual(["c:0"], stepper.handle_names()) + self.assertSetEqual({"c"}, stepper.handle_node_names()) self.assertAllClose(6.0, stepper.cont(self.c)) @@ -250,6 +288,7 @@ def testOverrideAndContToSameTensor(self): # As a result of the override, the tensor handle should have been # invalidated. 
self.assertEqual([], stepper.handle_names()) + self.assertSetEqual(set(), stepper.handle_node_names()) result = stepper.cont(self.c) self.assertAllClose(7.0, result) @@ -293,6 +332,71 @@ def testInvalidOverrideArgumentType(self): with self.assertRaisesRegexp(TypeError, "Expected type str; got type"): stepper.override_tensor(self.a, 42.0) + def testTransitiveClosureWithCrossLinksShouldHaveCorrectOrder(self): + stepper = NodeStepper(self.sess, "z:0") + + sorted_nodes = stepper.sorted_nodes() + self.assertEqual(4, len(sorted_nodes)) + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("x/read")) + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("y")) + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("z")) + self.assertLess(sorted_nodes.index("y"), sorted_nodes.index("z")) + + def testNodeStepperConstructorShouldAllowListOrTupleOrDictOfFetches(self): + for i in range(6): + if i == 0: + fetches = [self.e, [self.f, self.z]] + elif i == 1: + fetches = (self.e, (self.f, self.z)) + elif i == 2: + fetches = {"e": self.e, "fz": {"f": self.f, "z": self.z}} + elif i == 3: + fetches = ["e:0", ["f:0", "z:0"]] + elif i == 4: + fetches = ("e:0", ("f:0", "z:0")) + elif i == 5: + fetches = {"e": "e:0", "fz": {"f": "f:0", "z": "z:0"}} + + stepper = NodeStepper(self.sess, fetches) + + sorted_nodes = stepper.sorted_nodes() + self.assertEqual(13, len(sorted_nodes)) + + # Check the topological order of the sorted nodes. + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("x/read")) + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("y")) + self.assertLess(sorted_nodes.index("x"), sorted_nodes.index("z")) + self.assertLess(sorted_nodes.index("y"), sorted_nodes.index("z")) + + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("a/read")) + self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("b/read")) + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("c")) + self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("c")) + self.assertLess(sorted_nodes.index("a"), sorted_nodes.index("d")) + self.assertLess(sorted_nodes.index("d"), sorted_nodes.index("e")) + self.assertLess(sorted_nodes.index("c"), sorted_nodes.index("e")) + self.assertLess(sorted_nodes.index("b"), sorted_nodes.index("f")) + self.assertLess(sorted_nodes.index("f_y"), sorted_nodes.index("f")) + + closure_elements = stepper.closure_elements() + self.assertIn("x/read:0", closure_elements) + self.assertIn("e:0", closure_elements) + self.assertIn("f:0", closure_elements) + + self.assertEqual([0], stepper.output_slots_in_closure("x/read")) + self.assertEqual([0], stepper.output_slots_in_closure("e")) + self.assertEqual([0], stepper.output_slots_in_closure("f")) + + result = stepper.finalize() + if i == 0 or i == 1 or i == 3 or i == 4: + self.assertAllClose(24.0, result[0]) + self.assertAllClose(10.0, result[1][0]) + self.assertAllClose(-4.0, result[1][1]) + elif i == 2 or i == 5: + self.assertAllClose(24.0, result["e"]) + self.assertAllClose(10.0, result["fz"]["f"]) + self.assertAllClose(-4.0, result["fz"]["z"]) + class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase): @@ -308,6 +412,24 @@ def setUp(self): def tearDown(self): tf.reset_default_graph() + def testIsPlaceholdersShouldGiveCorrectAnswers(self): + stepper = NodeStepper(self.sess, self.y) + + self.assertTrue(stepper.is_placeholder(self.ph0.name)) + self.assertTrue(stepper.is_placeholder(self.ph1.name)) + + self.assertFalse(stepper.is_placeholder(self.x.name)) + 
self.assertFalse(stepper.is_placeholder(self.y.name)) + + with self.assertRaisesRegexp(ValueError, + "A is not in the transitive closure"): + self.assertFalse(stepper.is_placeholder("A")) + + def testPlaceholdersShouldGiveCorrectAnswers(self): + stepper = NodeStepper(self.sess, self.y) + + self.assertSetEqual({"ph0", "ph1"}, set(stepper.placeholders())) + def testContWithPlaceholders(self): stepper = NodeStepper( self.sess, @@ -317,8 +439,9 @@ def testContWithPlaceholders(self): self.ph1: [[-1.0], [0.5]] }) - self.assertEqual(["ph0:0", "ph1:0", "x:0", "y:0"], - stepper.sorted_transitive_closure()) + self.assertEqual(4, len(stepper.sorted_nodes())) + self.assertSetEqual({"ph0:0", "ph1:0", "x:0", "y:0"}, + set(stepper.closure_elements())) result = stepper.cont(self.x) self.assertAllClose([[0.0], [5.5]], result) @@ -328,6 +451,7 @@ def testContWithPlaceholders(self): }, stepper.last_feed_types()) self.assertEqual(["x:0"], stepper.handle_names()) + self.assertSetEqual({"x"}, stepper.handle_node_names()) result = stepper.cont(self.y) self.assertAllClose([[-1.0], [6.0]], result) @@ -336,18 +460,64 @@ def testContWithPlaceholders(self): "ph1:0": NodeStepper.FEED_TYPE_CLIENT, }, stepper.last_feed_types()) - def testAttemptToContToPlaceholder(self): + def testAttemptToContToPlaceholderWithTensorFeedKeysShouldWork(self): + """Continuing to a placeholder should be allowed, using client feed.""" + + ph0_feed = [[1.0, 2.0], [-3.0, 5.0]] + ph1_feed = [[-1.0], [0.5]] + stepper = NodeStepper( + self.sess, self.y, feed_dict={ + self.ph0: ph0_feed, + self.ph1: ph1_feed, + }) + + self.assertAllClose(ph0_feed, stepper.cont(self.ph0)) + self.assertEqual({ + self.ph0.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + self.assertAllClose(ph1_feed, stepper.cont(self.ph1)) + self.assertEqual({ + self.ph1.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + ph0_node = self.sess.graph.as_graph_element("ph0") + self.assertAllClose(ph0_feed, stepper.cont(ph0_node)) + self.assertEqual({ + self.ph0.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + self.assertAllClose([[-1.0], [6.0]], stepper.finalize()) + + def testAttemptToContToPlaceholderWithTensorNameFeedKeysShouldWork(self): + + ph0_feed = [[1.0, 2.0], [-3.0, 5.0]] + ph1_feed = [[-1.0], [0.5]] stepper = NodeStepper( self.sess, self.y, feed_dict={ - self.ph0: [[1.0, 2.0], [-3.0, 5.0]], - self.ph1: [[-1.0], [0.5]] + self.ph0.name: ph0_feed, + self.ph1.name: ph1_feed, }) - with self.assertRaisesRegexp(ValueError, - r"Should not call cont\(\) on a Placeholder"): - stepper.cont(self.ph0) + self.assertAllClose(ph0_feed, stepper.cont(self.ph0)) + self.assertEqual({ + self.ph0.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + self.assertAllClose(ph1_feed, stepper.cont(self.ph1)) + self.assertEqual({ + self.ph1.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + ph0_node = self.sess.graph.as_graph_element("ph0") + self.assertAllClose(ph0_feed, stepper.cont(ph0_node)) + self.assertEqual({ + self.ph0.name: NodeStepper.FEED_TYPE_CLIENT + }, stepper.last_feed_types()) + + self.assertAllClose([[-1.0], [6.0]], stepper.finalize()) class StepperBackwardRunTest(test_util.TensorFlowTestCase): @@ -552,6 +722,7 @@ def testOverrideThenContToUpdate(self): self.assertEqual({}, stepper.last_feed_types()) self.assertEqual(set(), stepper.dirty_variables()) self.assertEqual(["d:0"], stepper.handle_names()) + self.assertSetEqual({"d"}, stepper.handle_node_names()) # Override the value from 1.0 to 
10.0. stepper.override_tensor("a/read:0", 10.0) @@ -571,6 +742,7 @@ def testOverrideThenContToUpdate(self): # The tensor handle to d:0 should have been removed due to the dirty # transitive closure. self.assertEqual([], stepper.handle_names()) + self.assertSetEqual(set(), stepper.handle_node_names()) # For this backprop on c, the overriding value of a/read:0 should have been # used: @@ -586,6 +758,7 @@ def testOverrideThenContToUpdate(self): result = stepper.cont("d:0") self.assertAllClose(2.0, result) self.assertEqual(["d:0"], stepper.handle_names()) + self.assertSetEqual({"d"}, stepper.handle_node_names()) # Then call update_c again, without restoring c. result = stepper.cont( @@ -613,13 +786,16 @@ def testContToNodeWithOutputTensors(self): # output tensor also is in the transitive closure. # Do not assume a specific op, e.g., ""gradients/e_grad/Reshape_1", # because it may vary between builds. - closure = stepper.sorted_transitive_closure() + closure_elements = stepper.closure_elements() op_with_output_in_closure = None - for element_name in closure: - if element_name + ":0" in closure: + for element_name in closure_elements: + if element_name + ":0" in closure_elements: op_with_output_in_closure = str(element_name) break + self.assertEqual([0], + stepper.output_slots_in_closure(op_with_output_in_closure)) + self.assertIsNotNone(op_with_output_in_closure) output_tensor = op_with_output_in_closure + ":0" @@ -632,6 +808,8 @@ def testContToNodeWithOutputTensors(self): stepper.cont(op_with_output_in_closure) self.assertEqual([output_tensor], stepper.handle_names()) + self.assertSetEqual({op_with_output_in_closure}, + stepper.handle_node_names()) # Do a cont() call that uses the cached tensor of # "gradients/?_grad/Reshape_1:0". diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py index cf6a4e58d87395..df407bebeefbdf 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py @@ -70,20 +70,44 @@ def __init__(self, sess, dump_root=None, log_usage=True): self._dump_root = dump_root - # State flag for running till a tensor filter is passed. - self._run_till_filter_pass = None + self._initialize_argparsers() - # State related to tensor filters. + # Registered tensor filters. self._tensor_filters = {} - # Options for the on-run-start hook: - # 1) run (DEBUG_RUN) - # 2) run --nodebug (NON_DEBUG_RUN) - # 3) invoke_stepper (INVOKE_STEPPER, not implemented) - self._on_run_start_parsers = {} + # Below are the state variables of this wrapper object. + # _active_tensor_filter: what (if any) tensor filter is in effect. If such + # a filter is in effect, this object will call run() method of the + # underlying TensorFlow Session object until the filter passes. This is + # activated by the "-f" flag of the "run" command. + # _run_through_times: keeps track of how many times the wrapper needs to + # run through without stopping at the run-end CLI. It is activated by the + # "-t" option of the "run" command. + # _skip_debug: keeps track of whether the current run should be executed + # without debugging. It is activated by the "-n" option of the "run" + # command. + # + # _run_start_response: keeps track what OnRunStartResponse the wrapper + # should return at the next run-start callback. If this information is + # unavailable (i.e., is None), the run-start CLI will be launched to ask + # the user. This is the case, e.g., right before the first run starts. 
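Aside: the comment block above describes the `-t` (run-through) semantics in prose; a minimal, self-contained sketch of that countdown follows here, ahead of the initializers in the next hunk. The class and method names are illustrative stand-ins, not TensorFlow API; the real logic lives in `on_run_start()` and `_update_run_calls_state()` further down in this file's diff.

```python
# Illustrative sketch (assumed names, not TensorFlow API) of the "-t" countdown
# described above: proceed through N - 1 Session.run() calls without stopping,
# then watch tensors on the N-th call.
class RunThroughCountdown(object):

  def __init__(self):
    self._run_through_times = 1  # Default: stop at every run.

  def set_times(self, times):
    """Analogous to the "run -t <times>" CLI command."""
    self._run_through_times = times

  def on_run_start(self):
    # Mirrors (in simplified form) the decrement in _update_run_calls_state()
    # followed by the branching in on_run_start() of the wrapper below.
    self._run_through_times -= 1
    if self._run_through_times > 0:
      return "NON_DEBUG_RUN"  # Just run through, no tensor watching.
    return "DEBUG_RUN"        # Last run of the batch: watch tensors.


countdown = RunThroughCountdown()
countdown.set_times(3)
print([countdown.on_run_start() for _ in range(3)])
# ['NON_DEBUG_RUN', 'NON_DEBUG_RUN', 'DEBUG_RUN']
```

The run-count tests added later in this diff exercise exactly this behavior.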
+ self._active_tensor_filter = None + self._run_through_times = 1 + self._skip_debug = False + self._run_start_response = None + + def _initialize_argparsers(self): + self._argparsers = {} ap = argparse.ArgumentParser( description="Run through, with or without debug tensor watching.", usage=argparse.SUPPRESS) + ap.add_argument( + "-t", + "--times", + dest="times", + type=int, + default=1, + help="How many Session.run() calls to proceed with.") ap.add_argument( "-n", "--no_debug", @@ -97,12 +121,17 @@ def __init__(self, sess, dump_root=None, log_usage=True): type=str, default="", help="Run until a tensor in the graph passes the specified filter.") - self._on_run_start_parsers["run"] = ap + self._argparsers["run"] = ap ap = argparse.ArgumentParser( description="Invoke stepper (cont, step, breakpoint, etc.)", usage=argparse.SUPPRESS) - self._on_run_start_parsers["invoke_stepper"] = ap + self._argparsers["invoke_stepper"] = ap + + ap = argparse.ArgumentParser( + description="Display information about this Session.run() call.", + usage=argparse.SUPPRESS) + self._argparsers["run_info"] = ap def add_tensor_filter(self, filter_name, tensor_filter): """Add a tensor filter. @@ -151,46 +180,58 @@ def on_run_start(self, request): self._update_run_calls_state(request.run_call_count, request.fetches, request.feed_dict) - if self._run_till_filter_pass: + if self._active_tensor_filter: # If we are running till a filter passes, we just need to keep running # with the DEBUG_RUN option. return framework.OnRunStartResponse(framework.OnRunStartAction.DEBUG_RUN, self._get_run_debug_urls()) - run_start_cli = curses_ui.CursesUI() - - run_start_cli.register_command_handler( - "run", - self._on_run_start_run_handler, - self._on_run_start_parsers["run"].format_help(), - prefix_aliases=["r"]) - run_start_cli.register_command_handler( - "invoke_stepper", - self._on_run_start_step_handler, - self._on_run_start_parsers["invoke_stepper"].format_help(), - prefix_aliases=["s"]) - - if self._tensor_filters: - # Register tab completion for the filter names. - run_start_cli.register_tab_comp_context(["run", "r"], - list(self._tensor_filters.keys())) - - run_start_cli.set_help_intro( - cli_shared.get_run_start_intro(request.run_call_count, request.fetches, - request.feed_dict, self._tensor_filters)) - - # Create initial screen output detailing the run. - title = "run-start: " + self._run_description - response = run_start_cli.run_ui( - init_command="help", title=title, title_color="blue_on_white") - if response == debugger_cli_common.EXPLICIT_USER_EXIT: + if self._run_call_count > 1 and not self._skip_debug: + if self._run_through_times > 0: + # Just run through without debugging. + return framework.OnRunStartResponse( + framework.OnRunStartAction.NON_DEBUG_RUN, []) + elif self._run_through_times == 0: + # It is the run at which the run-end CLI will be launched: activate + # debugging. + return framework.OnRunStartResponse( + framework.OnRunStartAction.DEBUG_RUN, + self._get_run_debug_urls()) + + if self._run_start_response is None: + self._prep_cli_for_run_start() + + self._run_start_response = self._launch_cli(is_run_start=True) + if self._run_through_times > 1: + self._run_through_times -= 1 + + if self._run_start_response == debugger_cli_common.EXPLICIT_USER_EXIT: # Explicit user "exit" command leads to sys.exit(1). 
print( "Note: user exited from debugger CLI: Calling sys.exit(1).", file=sys.stderr) sys.exit(1) - return response + return self._run_start_response + + def _prep_cli_for_run_start(self): + """Prepare (but not launch) the CLI for run-start.""" + + self._run_cli = curses_ui.CursesUI() + + help_intro = debugger_cli_common.RichTextLines([]) + if self._run_call_count == 1: + # Show logo at the onset of the first run. + help_intro.extend(cli_shared.get_tfdbg_logo()) + help_intro.extend(debugger_cli_common.RichTextLines("Upcoming run:")) + help_intro.extend(self._run_info) + + self._run_cli.set_help_intro(help_intro) + + # Create initial screen output detailing the run. + self._title = "run-start: " + self._run_description + self._init_command = "help" + self._title_color = "blue_on_white" def on_run_end(self, request): """Overrides on-run-end callback. @@ -216,111 +257,150 @@ def on_run_end(self, request): debug_dump = debug_data.DebugDumpDir( self._dump_root, partition_graphs=partition_graphs) - if request.tf_error: - help_intro = cli_shared.get_error_intro(request.tf_error) - - init_command = "help" - title_color = "red_on_white" - else: - help_intro = None - init_command = "lt" - - title_color = "black_on_white" - if self._run_till_filter_pass: - if not debug_dump.find( - self._tensor_filters[self._run_till_filter_pass], first_n=1): - # No dumped tensor passes the filter in this run. Clean up the dump - # directory and move on. - shutil.rmtree(self._dump_root) - return framework.OnRunEndResponse() - else: - # Some dumped tensor(s) from this run passed the filter. - init_command = "lt -f %s" % self._run_till_filter_pass - title_color = "red_on_white" - self._run_till_filter_pass = None - - analyzer = analyzer_cli.DebugAnalyzer(debug_dump) - - # Supply all the available tensor filters. - for filter_name in self._tensor_filters: - analyzer.add_tensor_filter(filter_name, - self._tensor_filters[filter_name]) - - run_end_cli = curses_ui.CursesUI() - run_end_cli.register_command_handler( - "list_tensors", - analyzer.list_tensors, - analyzer.get_help("list_tensors"), - prefix_aliases=["lt"]) - run_end_cli.register_command_handler( - "node_info", - analyzer.node_info, - analyzer.get_help("node_info"), - prefix_aliases=["ni"]) - run_end_cli.register_command_handler( - "list_inputs", - analyzer.list_inputs, - analyzer.get_help("list_inputs"), - prefix_aliases=["li"]) - run_end_cli.register_command_handler( - "list_outputs", - analyzer.list_outputs, - analyzer.get_help("list_outputs"), - prefix_aliases=["lo"]) - run_end_cli.register_command_handler( - "print_tensor", - analyzer.print_tensor, - analyzer.get_help("print_tensor"), - prefix_aliases=["pt"]) - - run_end_cli.register_command_handler( - "run", - self._run_end_run_command_handler, - "Helper command for incorrectly entered run command at the run-end " - "prompt.", - prefix_aliases=["r"] - ) - - # Get names of all dumped tensors. - dumped_tensor_names = [] - for datum in debug_dump.dumped_tensor_data: - dumped_tensor_names.append("%s:%d" % - (datum.node_name, datum.output_slot)) - - # Tab completions for command "print_tensors". - run_end_cli.register_tab_comp_context(["print_tensor", "pt"], - dumped_tensor_names) - - # Tab completion for commands "node_info", "list_inputs" and - # "list_outputs". The list comprehension is used below because nodes() - # output can be unicodes and they need to be converted to strs. 
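For reference, the `"%s:%d" % (datum.node_name, datum.output_slot)` formatting above (the removed registration it feeds continues just below, and a reinstated copy appears in a later hunk) is the `node_name:output_slot` convention used for tensor names throughout these tests, e.g. `"d:0"` and `"ph0:0"`. A one-line illustration with made-up node names:

```python
# Made-up node names, for illustration only.
dumped = [("hidden/MatMul", 0), ("hidden/BiasAdd", 0), ("Softmax", 0)]
dumped_tensor_names = ["%s:%d" % (name, slot) for name, slot in dumped]
print(dumped_tensor_names)  # ['hidden/MatMul:0', 'hidden/BiasAdd:0', 'Softmax:0']
```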
- run_end_cli.register_tab_comp_context( - ["node_info", "ni", "list_inputs", "li", "list_outputs", "lo"], - [str(node_name) for node_name in debug_dump.nodes()]) - # TODO(cais): Reduce API surface area for aliases vis-a-vis tab - # completion contexts and registered command handlers. - - title = "run-end: " + self._run_description - if help_intro: - run_end_cli.set_help_intro(help_intro) - run_end_cli.run_ui( - init_command=init_command, title=title, title_color=title_color) - - # Clean up the dump directory. - shutil.rmtree(self._dump_root) + passed_filter = None + if self._active_tensor_filter: + if not debug_dump.find( + self._tensor_filters[self._active_tensor_filter], first_n=1): + # No dumped tensor passes the filter in this run. Clean up the dump + # directory and move on. + self._remove_dump_root() + return framework.OnRunEndResponse() + else: + # Some dumped tensor(s) from this run passed the filter. + passed_filter = self._active_tensor_filter + self._active_tensor_filter = None + + self._prep_cli_for_run_end(debug_dump, request.tf_error, passed_filter) + + self._run_start_response = self._launch_cli() + + # Clean up the dump generated by this run. + self._remove_dump_root() else: - print("No debug information to show following a non-debug run() call.") + # No debug information to show following a non-debug run() call. + self._run_start_response = None # Return placeholder response that currently holds no additional # information. return framework.OnRunEndResponse() - def _on_run_start_run_handler(self, args, screen_info=None): + def _remove_dump_root(self): + if os.path.isdir(self._dump_root): + shutil.rmtree(self._dump_root) + + def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter): + """Prepare (but not launch) CLI for run-end, with debug dump from the run. + + Args: + debug_dump: (debug_data.DebugDumpDir) The debug dump directory from this + run. + tf_error: (None or OpError) OpError that happened during the run() call + (if any). + passed_filter: (None or str) Name of the tensor filter that just passed + and caused the preparation of this run-end CLI (if any). + """ + + if tf_error: + help_intro = cli_shared.get_error_intro(tf_error) + + self._init_command = "help" + self._title_color = "red_on_white" + else: + help_intro = None + self._init_command = "lt" + + self._title_color = "black_on_white" + if passed_filter is not None: + # Some dumped tensor(s) from this run passed the filter. + self._init_command = "lt -f %s" % passed_filter + self._title_color = "red_on_white" + + analyzer = analyzer_cli.DebugAnalyzer(debug_dump) + + # Supply all the available tensor filters. 
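The filter registration loop continues in the next hunk. For reference, a tensor filter of the kind looked up in `self._tensor_filters` and passed to `debug_dump.find()` above is just a callable taking a dump datum and a tensor value and returning a bool. The sketch below mirrors the `v_greater_than_twelve` filter defined in the wrapper tests later in this diff; the namedtuple is only a hypothetical stand-in for a real dump datum.

```python
import collections


def v_greater_than_twelve(datum, tensor):
  """Passes for dumps of node "v" whose value exceeds 12.0."""
  return datum.node_name == "v" and tensor > 12.0


# Hypothetical stand-in; real data come from DebugDumpDir.dumped_tensor_data.
FakeDatum = collections.namedtuple("FakeDatum", ["node_name"])

print(v_greater_than_twelve(FakeDatum("v"), 13.0))      # True
print(v_greater_than_twelve(FakeDatum("v"), 11.0))      # False
print(v_greater_than_twelve(FakeDatum("delta"), 99.0))  # False
```

Such a filter is registered with `add_tensor_filter("v_greater_than_twelve", v_greater_than_twelve)` and activated by the `run -f v_greater_than_twelve` command.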
+ for filter_name in self._tensor_filters: + analyzer.add_tensor_filter(filter_name, + self._tensor_filters[filter_name]) + + self._run_cli = curses_ui.CursesUI() + self._run_cli.register_command_handler( + "list_tensors", + analyzer.list_tensors, + analyzer.get_help("list_tensors"), + prefix_aliases=["lt"]) + self._run_cli.register_command_handler( + "node_info", + analyzer.node_info, + analyzer.get_help("node_info"), + prefix_aliases=["ni"]) + self._run_cli.register_command_handler( + "list_inputs", + analyzer.list_inputs, + analyzer.get_help("list_inputs"), + prefix_aliases=["li"]) + self._run_cli.register_command_handler( + "list_outputs", + analyzer.list_outputs, + analyzer.get_help("list_outputs"), + prefix_aliases=["lo"]) + self._run_cli.register_command_handler( + "print_tensor", + analyzer.print_tensor, + analyzer.get_help("print_tensor"), + prefix_aliases=["pt"]) + + # Get names of all dumped tensors. + dumped_tensor_names = [] + for datum in debug_dump.dumped_tensor_data: + dumped_tensor_names.append("%s:%d" % + (datum.node_name, datum.output_slot)) + + # Tab completions for command "print_tensors". + self._run_cli.register_tab_comp_context(["print_tensor", "pt"], + dumped_tensor_names) + + # Tab completion for commands "node_info", "list_inputs" and + # "list_outputs". The list comprehension is used below because nodes() + # output can be unicodes and they need to be converted to strs. + self._run_cli.register_tab_comp_context( + ["node_info", "ni", "list_inputs", "li", "list_outputs", "lo"], + [str(node_name) for node_name in debug_dump.nodes()]) + # TODO(cais): Reduce API surface area for aliases vis-a-vis tab + # completion contexts and registered command handlers. + + self._title = "run-end: " + self._run_description + + if help_intro: + self._run_cli.set_help_intro(help_intro) + + def _launch_cli(self, is_run_start=False): + """Launch the interactive command-line interface. + + Args: + is_run_start: (bool) whether this CLI launch occurs at a run-start + callback. + + Returns: + The OnRunStartResponse specified by the user using the "run" command. + """ + + self._register_this_run_info(self._run_cli) + response = self._run_cli.run_ui( + init_command=self._init_command, + title=self._title, + title_color=self._title_color) + + return response + + def _run_info_handler(self, args, screen_info=None): + return self._run_info + + def _run_handler(self, args, screen_info=None): """Command handler for "run" command during on-run-start.""" _ = screen_info # Currently unused. - parsed = self._on_run_start_parsers["run"].parse_args(args) + parsed = self._argparsers["run"].parse_args(args) if parsed.till_filter_pass: # For the run-till-bad-numerical-value-appears mode, use the DEBUG_RUN @@ -328,14 +408,18 @@ def _on_run_start_run_handler(self, args, screen_info=None): # state flag of the class itself to True. if parsed.till_filter_pass in self._tensor_filters: action = framework.OnRunStartAction.DEBUG_RUN - self._run_till_filter_pass = parsed.till_filter_pass + self._active_tensor_filter = parsed.till_filter_pass else: # Handle invalid filter name. return debugger_cli_common.RichTextLines( ["ERROR: tensor filter \"%s\" does not exist." % parsed.till_filter_pass]) - if parsed.no_debug: + self._skip_debug = parsed.no_debug + self._run_through_times = parsed.times + + if parsed.times > 1 or parsed.no_debug: + # If requested -t times > 1, the very next run will be a non-debug run. 
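The non-debug branch continues below. As a standalone sketch of how `_run_handler()` above interprets its arguments, the parser here mirrors the `-t`, `-n`, and `-f` flags registered in `_initialize_argparsers()`. The long-form name for `-f` and the keyword arguments for `-n` are not visible in the hunks shown, so those parts are assumptions consistent with how `parsed.till_filter_pass` and `parsed.no_debug` are used; the filter name is just the one from the tests.

```python
import argparse

# Mirror of the "run" command parser registered in _initialize_argparsers().
ap = argparse.ArgumentParser(description="run", usage=argparse.SUPPRESS)
ap.add_argument("-t", "--times", dest="times", type=int, default=1,
                help="How many Session.run() calls to proceed with.")
# action="store_true" is assumed; the diff elides the remaining kwargs of -n.
ap.add_argument("-n", "--no_debug", dest="no_debug", action="store_true",
                help="Run without debug tensor watching.")
# "--till_filter_pass" is assumed from the dest used in _run_handler().
ap.add_argument("-f", "--till_filter_pass", dest="till_filter_pass", type=str,
                default="",
                help="Run until a tensor passes the specified filter.")

print(ap.parse_args(["-t", "3"]).times)  # 3 -> sets _run_through_times
print(ap.parse_args(["-n"]).no_debug)    # True -> sets _skip_debug
print(ap.parse_args(["-f", "v_greater_than_twelve"]).till_filter_pass)
# 'v_greater_than_twelve' -> becomes _active_tensor_filter
```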
action = framework.OnRunStartAction.NON_DEBUG_RUN debug_urls = [] else: @@ -346,6 +430,28 @@ def _on_run_start_run_handler(self, args, screen_info=None): raise debugger_cli_common.CommandLineExit( exit_token=framework.OnRunStartResponse(action, debug_urls)) + def _register_this_run_info(self, curses_cli): + curses_cli.register_command_handler( + "run", + self._run_handler, + self._argparsers["run"].format_help(), + prefix_aliases=["r"]) + curses_cli.register_command_handler( + "invoke_stepper", + self._on_run_start_step_handler, + self._argparsers["invoke_stepper"].format_help(), + prefix_aliases=["s"]) + curses_cli.register_command_handler( + "run_info", + self._run_info_handler, + self._argparsers["run_info"].format_help(), + prefix_aliases=["ri"]) + + if self._tensor_filters: + # Register tab completion for the filter names. + curses_cli.register_tab_comp_context(["run", "r"], + list(self._tensor_filters.keys())) + def _on_run_start_step_handler(self, args, screen_info=None): """Command handler for "invoke_stepper" command during on-run-start.""" @@ -359,18 +465,6 @@ def _on_run_start_step_handler(self, args, screen_info=None): exit_token=framework.OnRunStartResponse( framework.OnRunStartAction.INVOKE_STEPPER, [])) - def _run_end_run_command_handler(self, args, screen_info=None): - """Handler for incorrectly entered run command at run-end prompt.""" - - _ = screen_info # Currently unused. - - return debugger_cli_common.RichTextLines([ - "ERROR: the \"run\" command is invalid for the run-end prompt.", "", - "To proceed to the next run, ", - " 1) exit this run-end prompt using the command \"exit\"", - " 2) enter the command \"run\" at the next run-start prompt.", - ]) - def _get_run_debug_urls(self): """Get the debug_urls value for the current run() call. @@ -397,3 +491,9 @@ def _update_run_calls_state(self, run_call_count, fetches, feed_dict): self._run_description = cli_shared.get_run_short_description(run_call_count, fetches, feed_dict) + self._run_through_times -= 1 + + self._run_info = cli_shared.get_run_start_intro(run_call_count, + fetches, + feed_dict, + self._tensor_filters) diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py index 5cef7bce258136..7c261ea082bf48 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py @@ -21,18 +21,96 @@ import shutil import tempfile +import tensorflow as tf + from tensorflow.python.client import session +from tensorflow.python.debug.cli import debugger_cli_common from tensorflow.python.debug.wrappers import local_cli_wrapper from tensorflow.python.framework import test_util from tensorflow.python.platform import googletest +class LocalCLIDebuggerWrapperSessionForTest( + local_cli_wrapper.LocalCLIDebugWrapperSession): + """Subclasses the wrapper class for testing. + + Overrides its CLI-related methods for headless testing environments. + Inserts observer variables for assertions. + """ + + def __init__(self, + command_args_sequence, + sess, + dump_root=None): + """Constructor of the for-test subclass. + + Args: + command_args_sequence: (list of list of str) A list of arguments for the + "run" command. + sess: See the doc string of LocalCLIDebugWrapperSession.__init__. + dump_root: See the doc string of LocalCLIDebugWrapperSession.__init__. 
+ """ + + local_cli_wrapper.LocalCLIDebugWrapperSession.__init__( + self, sess, dump_root=dump_root, log_usage=False) + + self._command_args_sequence = command_args_sequence + self._response_pointer = 0 + + # Observer variables. + self.observers = { + "debug_dumps": [], + "tf_errors": [], + "run_start_cli_run_numbers": [], + "run_end_cli_run_numbers": [], + } + + def _prep_cli_for_run_start(self): + pass + + def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter): + self.observers["debug_dumps"].append(debug_dump) + self.observers["tf_errors"].append(tf_error) + + def _launch_cli(self, is_run_start=False): + if is_run_start: + self.observers["run_start_cli_run_numbers"].append(self._run_call_count) + else: + self.observers["run_end_cli_run_numbers"].append(self._run_call_count) + + command_args = self._command_args_sequence[self._response_pointer] + self._response_pointer += 1 + + try: + self._run_handler(command_args) + except debugger_cli_common.CommandLineExit as e: + response = e.exit_token + + return response + + class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase): def setUp(self): self._tmp_dir = tempfile.mktemp() + self.v = tf.Variable(10.0, name="v") + self.delta = tf.constant(1.0, name="delta") + self.inc_v = tf.assign_add(self.v, self.delta, name="inc_v") + + self.ph = tf.placeholder(tf.float32, name="ph") + self.xph = tf.transpose(self.ph, name="xph") + self.m = tf.constant( + [[0.0, 1.0, 2.0], [-4.0, -1.0, 0.0]], dtype=tf.float32, name="m") + self.y = tf.matmul(self.m, self.xph, name="y") + + self.sess = tf.Session() + + # Initialize variable. + self.sess.run(self.v.initializer) + def tearDown(self): + tf.reset_default_graph() if os.path.isdir(self._tmp_dir): shutil.rmtree(self._tmp_dir) @@ -68,6 +146,174 @@ def testConstructWrapperWithExistingFileDumpRoot(self): local_cli_wrapper.LocalCLIDebugWrapperSession( session.Session(), dump_root=file_path, log_usage=False) + def testRunsUnderDebugMode(self): + # Test command sequence: run; run; run; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [[], [], []], self.sess, dump_root=self._tmp_dir) + + # run under debug mode twice. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + # Verify that the assign_add op did take effect. + self.assertAllClose(12.0, self.sess.run(self.v)) + + # Assert correct run call numbers for which the CLI has been launched at + # run-start and run-end. + self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([1, 2], wrapped_sess.observers["run_end_cli_run_numbers"]) + + # Verify that the dumps have been generated and picked up during run-end. + self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"])) + + # Verify that the TensorFlow runtime errors are picked up and in this case, + # they should be both None. + self.assertEqual([None, None], wrapped_sess.observers["tf_errors"]) + + def testRunsUnderNonDebugMode(self): + # Test command sequence: run -n; run -n; run -n; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-n"], ["-n"], ["-n"]], + self.sess, + dump_root=self._tmp_dir) + + # run three times. 
+ wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(13.0, self.sess.run(self.v)) + + self.assertEqual([1, 2, 3], + wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([], wrapped_sess.observers["run_end_cli_run_numbers"]) + + def testRunsUnderNonDebugThenDebugMode(self): + # Test command sequence: run -n; run -n; run; run; + # Do two NON_DEBUG_RUNs, followed by DEBUG_RUNs. + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-n"], ["-n"], [], []], + self.sess, + dump_root=self._tmp_dir) + + # run three times. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(13.0, self.sess.run(self.v)) + + self.assertEqual([1, 2, 3], + wrapped_sess.observers["run_start_cli_run_numbers"]) + + # Here, the CLI should have been launched only under the third run, + # because the first and second runs are NON_DEBUG. + self.assertEqual([3], wrapped_sess.observers["run_end_cli_run_numbers"]) + self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"])) + self.assertEqual([None], wrapped_sess.observers["tf_errors"]) + + def testRunMultipleTimesWithinLimit(self): + # Test command sequence: run -t 3; run; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-t", "3"], []], self.sess, dump_root=self._tmp_dir) + + # run three times. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(13.0, self.sess.run(self.v)) + + self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([3], wrapped_sess.observers["run_end_cli_run_numbers"]) + self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"])) + self.assertEqual([None], wrapped_sess.observers["tf_errors"]) + + def testRunMultipleTimesOverLimit(self): + # Test command sequence: run -t 3; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-t", "3"]], self.sess, dump_root=self._tmp_dir) + + # run twice, which is less than the number of times specified by the + # command. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(12.0, self.sess.run(self.v)) + + self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([], wrapped_sess.observers["run_end_cli_run_numbers"]) + self.assertEqual(0, len(wrapped_sess.observers["debug_dumps"])) + self.assertEqual([], wrapped_sess.observers["tf_errors"]) + + def testRunMixingDebugModeAndMultpleTimes(self): + # Test command sequence: run -n; run -t 2; run; run; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-n"], ["-t", "2"], [], []], + self.sess, + dump_root=self._tmp_dir) + + # run four times. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(14.0, self.sess.run(self.v)) + + self.assertEqual([1, 2], + wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([3, 4], wrapped_sess.observers["run_end_cli_run_numbers"]) + self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"])) + self.assertEqual([None, None], wrapped_sess.observers["tf_errors"]) + + def testRuntimeErrorShouldBeCaught(self): + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [[], []], self.sess, dump_root=self._tmp_dir) + + # Do a run that should lead to an TensorFlow runtime error. 
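The body of `testRuntimeErrorShouldBeCaught` continues below. As a quick sanity check on why that feed must fail: in `setUp`, `m` has shape `[2, 3]` and `y = tf.matmul(m, tf.transpose(ph))`, so feeding `ph` with shape `[3, 1]` makes the transposed operand `[1, 3]` and the inner dimensions (3 vs. 1) disagree. The same arithmetic in plain NumPy:

```python
import numpy as np

m = np.array([[0.0, 1.0, 2.0], [-4.0, -1.0, 0.0]], dtype=np.float32)  # [2, 3]
ph_feed = np.array([[0.0], [1.0], [2.0]], dtype=np.float32)           # [3, 1]
xph = ph_feed.T                                                       # [1, 3]

try:
  np.matmul(m, xph)  # Inner dimensions 3 and 1 do not match.
except ValueError as e:
  print("Shape mismatch, as the test expects:", e)
```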
+ wrapped_sess.run(self.y, feed_dict={self.ph: [[0.0], [1.0], [2.0]]}) + + self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"]) + self.assertEqual([1], wrapped_sess.observers["run_end_cli_run_numbers"]) + self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"])) + + # Verify that the runtime error is caught by the wrapped session properly. + self.assertEqual(1, len(wrapped_sess.observers["tf_errors"])) + tf_error = wrapped_sess.observers["tf_errors"][0] + self.assertEqual("y", tf_error.op.name) + + def testRunTillFilterPassesShouldLaunchCLIAtCorrectRun(self): + # Test command sequence: + # run -f greater_than_twelve; run -f greater_than_twelve; run; + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["-f", "v_greater_than_twelve"], ["-f", "v_greater_than_twelve"], []], + self.sess, dump_root=self._tmp_dir) + + def v_greater_than_twelve(datum, tensor): + return datum.node_name == "v" and tensor > 12.0 + + wrapped_sess.add_tensor_filter( + "v_greater_than_twelve", v_greater_than_twelve) + + # run five times. + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + wrapped_sess.run(self.inc_v) + + self.assertAllClose(15.0, self.sess.run(self.v)) + + self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"]) + + # run-end CLI should NOT have been launched for run #2 and #3, because only + # starting from run #4 v becomes greater than 12.0. + self.assertEqual([4, 5], wrapped_sess.observers["run_end_cli_run_numbers"]) + + self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"])) + self.assertEqual([None, None], wrapped_sess.observers["tf_errors"]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py index 281979384ed7f5..fe935881c68751 100644 --- a/tensorflow/python/framework/framework_lib.py +++ b/tensorflow/python/framework/framework_lib.py @@ -35,6 +35,7 @@ @@control_dependencies @@convert_to_tensor @@convert_to_tensor_or_indexed_slices +@@convert_to_tensor_or_sparse_tensor @@get_default_graph @@reset_default_graph @@import_graph_def @@ -93,6 +94,7 @@ from tensorflow.python.framework.ops import convert_to_tensor_or_indexed_slices from tensorflow.python.framework.random_seed import get_seed from tensorflow.python.framework.random_seed import set_random_seed +from tensorflow.python.framework.sparse_tensor import convert_to_tensor_or_sparse_tensor from tensorflow.python.framework.subscribe import subscribe from tensorflow.python.framework.importer import import_graph_def diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index 9460b2a47495fe..1fa52a71386376 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -411,6 +411,7 @@ def __init__(self, *args, **kwargs): self.extra_vars = [] def getvar(self, + getter, name, shape=None, dtype=None, diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py index 67c0d64ea8fc79..606d4353f4b6b1 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -50,7 +50,6 @@ def module_names(): "tf.errors", "tf.image", "tf.nn", - "tf.nn.rnn_cell", "tf.train", "tf.python_io", "tf.summary", @@ -71,6 +70,7 @@ def module_names(): "tf.contrib.layers", "tf.contrib.learn", "tf.contrib.learn.monitors", + "tf.contrib.legacy_seq2seq", 
"tf.contrib.linalg", "tf.contrib.losses", "tf.contrib.metrics", @@ -184,7 +184,6 @@ def library(name, title, module=None, **args): "batch_norm_with_global_normalization_grad", "all_candidate_sampler", "seq2seq"], prefix=PREFIX_TEXT), - library("rnn_cell", "Neural Network RNN Cells", tf.nn.rnn_cell), library("client", "Running Graphs", client_lib), library("train", "Training", @@ -228,10 +227,12 @@ def library(name, title, module=None, **args): library("contrib.learn", "Learn (contrib)", tf.contrib.learn), library("contrib.learn.monitors", "Monitors (contrib)", tf.contrib.learn.monitors), + library("contrib.legacy_seq2seq", "Sequence to Sequence (contrib)", + tf.contrib.legacy_seq2seq), library("contrib.linalg", "Linear Algebra (contrib)", tf.contrib.linalg), library("contrib.losses", "Losses (contrib)", tf.contrib.losses), - library("contrib.rnn", "RNN (contrib)", tf.contrib.rnn), + library("contrib.rnn", "RNN and Cells (contrib)", tf.contrib.rnn), library("contrib.metrics", "Metrics (contrib)", tf.contrib.metrics), library("contrib.training", "Training (contrib)", tf.contrib.training), library("contrib.util", "Utilities (contrib)", tf.contrib.util), @@ -254,12 +255,25 @@ def library(name, title, module=None, **args): # to imports in learn/python/learn/__init__.py # TODO(wicke): Remove contrib.layers.relu* after shortnames are # disabled. These conflict with tf.nn.relu* +# TODO(xiejw): Remove tf.nn.rnn_cell.* once the implementation files are moved. EXCLUDE = frozenset(["tf.contrib.learn.monitors.NanLossDuringTrainingError", "tf.contrib.layers.relu", "tf.contrib.layers.relu6", "tf.contrib.framework.assert_global_step", "tf.contrib.framework.get_global_step", "tf.contrib.learn.NanLossDuringTrainingError", - "tf.contrib.layers.stack"]) + "tf.contrib.layers.stack", + "tf.confusion_matrix", + "tf.nn.rnn_cell.RNNCell", + "tf.nn.rnn_cell.BasicRNNCell", + "tf.nn.rnn_cell.BasicLSTMCell", + "tf.nn.rnn_cell.GRUCell", + "tf.nn.rnn_cell.LSTMCell", + "tf.nn.rnn_cell.LSTMStateTuple", + "tf.nn.rnn_cell.MultiRNNCell", + "tf.nn.rnn_cell.DropoutWrapper", + "tf.nn.rnn_cell.EmbeddingWrapper", + "tf.nn.rnn_cell.InputProjectionWrapper", + "tf.nn.rnn_cell.OutputProjectionWrapper"]) def main(unused_argv): diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py index 587f883260ce3d..f25a8ceffc52a6 100644 --- a/tensorflow/python/framework/graph_util_impl.py +++ b/tensorflow/python/framework/graph_util_impl.py @@ -40,6 +40,7 @@ "ScatterUpdate", "TruncatedNormal", "Variable", + "VariableV2", } @@ -193,12 +194,16 @@ def convert_variables_to_constants(sess, input_graph_def, output_node_names, Returns: GraphDef containing a simplified version of the original. """ + # This graph only includes the nodes needed to evaluate the output nodes, and + # removes unneeded nodes like those involved in saving and assignment. + inference_graph = extract_sub_graph(input_graph_def, output_node_names) + found_variables = {} variable_names = [] variable_dict_names = [] - for node in input_graph_def.node: - if node.op == "Assign": - variable_name = node.input[0] + for node in inference_graph.node: + if node.op == "Variable": + variable_name = node.name if (variable_names_whitelist is not None and variable_name not in variable_names_whitelist): continue @@ -211,10 +216,6 @@ def convert_variables_to_constants(sess, input_graph_def, output_node_names, found_variables = dict(zip(variable_dict_names, returned_variables)) logging.info("Froze %d variables." 
% len(returned_variables)) - # This graph only includes the nodes needed to evaluate the output nodes, and - # removes unneeded nodes like those involved in saving and assignment. - inference_graph = extract_sub_graph(input_graph_def, output_node_names) - output_graph_def = graph_pb2.GraphDef() how_many_converted = 0 for input_node in inference_graph.node: diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py index 549ae34d6de330..2cc4868b7f3c44 100644 --- a/tensorflow/python/framework/graph_util_test.py +++ b/tensorflow/python/framework/graph_util_test.py @@ -33,7 +33,7 @@ def test_device_func_pin_variable_to_cpu(op): if op.device: return op.device - return "/cpu:0" if op.node_def.op == "Variable" else op.device + return "/cpu:0" if op.node_def.op in ["Variable", "VariableV2"] else op.device class DeviceFunctionsTest(tf.test.TestCase): @@ -190,6 +190,7 @@ def testConvertVariablesToConsts(self): self.assertEqual(4, len(constant_graph_def.node)) for node in constant_graph_def.node: self.assertNotEqual("Variable", node.op) + self.assertNotEqual("VariableV2", node.op) with tf.Session() as sess: output_node = sess.graph.get_tensor_by_name("output_node:0") output = sess.run(output_node) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index d1edf43193435d..8ed0101a6b744a 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -43,7 +43,7 @@ from tensorflow.python.framework import versions from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat -from tensorflow.python.util import deprecation +from tensorflow.python.util import decorator_utils def _override_helper(clazz_object, operator, func): @@ -3980,7 +3980,7 @@ class GraphKeys(object): for more details. * `SUMMARIES`: the summary `Tensor` objects that have been created in the graph. See - [`tf.contrib.deprecated.merge_all_summaries()`](../../api_docs/python/train.md#merge_all_summaries) + [`tf.summary.merge_all()`](../../api_docs/python/summary.md#merge_all) for more details. * `QUEUE_RUNNERS`: the `QueueRunner` objects that are used to produce input for a computation. See @@ -4063,12 +4063,12 @@ class GraphKeys(object): COND_CONTEXT = "cond_context" WHILE_CONTEXT = "while_context" - @property - @deprecation.deprecated("2017-03-02", - "VARIABLES collection name is deprecated, " - "please use GLOBAL_VARIABLES instead") - def VARIABLES(self): - return self.GLOBAL_VARIABLES + @decorator_utils.classproperty + def VARIABLES(cls): # pylint: disable=no-self-argument + logging.warning("VARIABLES collection name is deprecated, " + "please use GLOBAL_VARIABLES instead; " + "VARIABLES will be removed after 2017-03-02.") + return cls.GLOBAL_VARIABLES def add_to_collection(name, value): diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py index 8b7bb130de6e50..0a0f8f71df2070 100644 --- a/tensorflow/python/framework/sparse_tensor.py +++ b/tensorflow/python/framework/sparse_tensor.py @@ -12,24 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
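Before the `sparse_tensor.py` hunks continue, a hedged usage sketch of `convert_variables_to_constants()` after the reordering above (extract the inference sub-graph first, then locate `Variable` nodes by op type). The import path follows the `tensorflow.python.framework.graph_util` module layout implied by this diff; treat it as an assumption if your build differs.

```python
import tensorflow as tf
from tensorflow.python.framework import graph_util  # Path assumed from this diff.

# Sketch only: freeze the variable reachable from "output_node" into a Const.
with tf.Session() as sess:
  v = tf.Variable(2.0, name="v")
  out = tf.add(v, 3.0, name="output_node")
  sess.run(v.initializer)

  frozen = graph_util.convert_variables_to_constants(
      sess, sess.graph.as_graph_def(), ["output_node"])

  # No "Variable"/"VariableV2" ops should remain, matching the assertions in
  # graph_util_test.py above.
  print(sorted(set(node.op for node in frozen.node)))
```

Extracting the sub-graph before scanning for variables means nodes that only feed save or assign ops are never fetched, which is what the relocated comment explains.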
# ============================================================================== - """Classes and functions used to construct graphs.""" # pylint: disable=g-bad-name from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections +import six from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util - # pylint: disable=protected-access _TensorLike = ops._TensorLike _eval_using_default_session = ops._eval_using_default_session _override_helper = ops._override_helper + # pylint: enable=protected-access @@ -37,37 +36,41 @@ class SparseTensor(_TensorLike): """Represents a sparse tensor. TensorFlow represents a sparse tensor as three separate dense tensors: - `indices`, `values`, and `shape`. In Python, the three tensors are + `indices`, `values`, and `dense_shape`. In Python, the three tensors are collected into a `SparseTensor` class for ease of use. If you have separate - `indices`, `values`, and `shape` tensors, wrap them in a `SparseTensor` + `indices`, `values`, and `dense_shape` tensors, wrap them in a `SparseTensor` object before passing to the ops below. - Concretely, the sparse tensor `SparseTensor(indices, values, shape)` + Concretely, the sparse tensor `SparseTensor(indices, values, dense_shape)` comprises the following components, where `N` and `ndims` are the number of values and number of dimensions in the `SparseTensor`, respectively: - * `indices`: A 2-D int64 tensor of shape `[N, ndims]`, which specifies + * `indices`: A 2-D int64 tensor of dense_shape `[N, ndims]`, which specifies the indices of the elements in the sparse tensor that contain nonzero values (elements are zero-indexed). For example, `indices=[[1,3], [2,4]]` specifies that the elements with indexes of [1,3] and [2,4] have nonzero values. - * `values`: A 1-D tensor of any type and shape `[N]`, which supplies the + * `values`: A 1-D tensor of any type and dense_shape `[N]`, which supplies the values for each element in `indices`. For example, given `indices=[[1,3], [2,4]]`, the parameter `values=[18, 3.6]` specifies that element [1,3] of the sparse tensor has a value of 18, and element [2,4] of the tensor has a value of 3.6. - * `shape`: A 1-D int64 tensor of shape `[ndims]`, which specifies the shape - of the sparse tensor. Takes a list indicating the number of elements in - each dimension. For example, `shape=[3,6]` specifies a two-dimensional 3x6 - tensor, `shape=[2,3,4]` specifies a three-dimensional 2x3x4 tensor, and - `shape=[9]` specifies a one-dimensional tensor with 9 elements. + * `dense_shape`: A 1-D int64 tensor of dense_shape `[ndims]`, which specifies + the + dense_shape of the sparse tensor. Takes a list indicating the number of + elements + in each dimension. For example, `dense_shape=[3,6]` specifies a + two-dimensional + 3x6 tensor, `dense_shape=[2,3,4]` specifies a three-dimensional 2x3x4 + tensor, and + `dense_shape=[9]` specifies a one-dimensional tensor with 9 elements. 
The corresponding dense tensor satisfies: ```python - dense.shape = shape + dense.shape = dense_shape dense[tuple(indices[i])] = values[i] ``` @@ -80,7 +83,7 @@ class SparseTensor(_TensorLike): Example: The sparse tensor ```python - SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) + SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) ``` represents the dense tensor @@ -95,7 +98,7 @@ class SparseTensor(_TensorLike): @@get_shape @@indices @@values - @@shape + @@dense_shape @@dtype @@op @@graph @@ -105,26 +108,31 @@ class SparseTensor(_TensorLike): def from_value(cls, sparse_tensor_value): if not (isinstance(sparse_tensor_value, SparseTensor) or isinstance(sparse_tensor_value, SparseTensorValue)): - raise TypeError( - "Neither a SparseTensor nor SparseTensorValue: %s." - % sparse_tensor_value) + raise TypeError("Neither a SparseTensor nor SparseTensorValue: %s." % + sparse_tensor_value) return SparseTensor( indices=sparse_tensor_value.indices, values=sparse_tensor_value.values, - shape=sparse_tensor_value.shape) + dense_shape=sparse_tensor_value.dense_shape) - def __init__(self, indices, values, shape): + def __init__(self, indices, values, dense_shape=None, shape=None): """Creates a `SparseTensor`. Args: - indices: A 2-D int64 tensor of shape `[N, ndims]`. - values: A 1-D tensor of any type and shape `[N]`. - shape: A 1-D int64 tensor of shape `[ndims]`. + indices: A 2-D int64 tensor of dense_shape `[N, ndims]`. + values: A 1-D tensor of any type and dense_shape `[N]`. + dense_shape: A 1-D int64 tensor of dense_shape `[ndims]`. + shape: Temporary. Legacy naming of dense_shape. Only one of `shape` or + `dense_shape` must be provided. Returns: - A `SparseTensor` + A `SparseTensor`. + + Raises: + ValueError: if both `shape` and `dense_shape` are provided. """ - with ops.name_scope(None, "SparseTensor", [indices, values, shape]): + with ops.name_scope(None, "SparseTensor", + [indices, values, shape, dense_shape]): indices = ops.convert_to_tensor( indices, name="indices", dtype=dtypes.int64) # Always pass as_ref=True because we want to be able to update @@ -133,35 +141,40 @@ def __init__(self, indices, values, shape): # is a VariableOp and updating users of SparseTensor. values = ops.internal_convert_to_tensor( values, name="values", as_ref=True) - shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int64) + if shape is not None and dense_shape is not None: + raise ValueError("Only one of shape or dense_shape must be provided, " + "but saw %s and %s" % (shape, dense_shape)) + dense_shape = shape if shape is not None else dense_shape + dense_shape = ops.convert_to_tensor( + dense_shape, name="dense_shape", dtype=dtypes.int64) self._indices = indices self._values = values - self._shape = shape + self._dense_shape = dense_shape indices_shape = indices.get_shape().with_rank(2) values_shape = values.get_shape().with_rank(1) - shape_shape = shape.get_shape().with_rank(1) + dense_shape_shape = dense_shape.get_shape().with_rank(1) # Assert number of rows in indices match the number of elements in values. indices_shape[0].merge_with(values_shape[0]) # Assert number of columns in indices matches the number of elements in - # shape. - indices_shape[1].merge_with(shape_shape[0]) + # dense_shape. + indices_shape[1].merge_with(dense_shape_shape[0]) def get_shape(self): - """Get the `TensorShape` that represents the shape of the dense tensor. + """Get the `TensorShape` representing the shape of the dense tensor. Returns: A `TensorShape` object. 
""" - return tensor_util.constant_value_as_shape(self._shape) + return tensor_util.constant_value_as_shape(self._dense_shape) @property def indices(self): """The indices of non-zero values in the represented dense tensor. Returns: - A 2-D Tensor of int64 with shape `[N, ndims]`, where `N` is the + A 2-D Tensor of int64 with dense_shape `[N, ndims]`, where `N` is the number of non-zero values in the tensor, and `ndims` is the rank. """ return self._indices @@ -186,18 +199,23 @@ def dtype(self): return self._values.dtype @property - def shape(self): + def dense_shape(self): """A 1-D Tensor of int64 representing the shape of the dense tensor.""" - return self._shape + return self._dense_shape + + @property + def shape(self): + """Legacy property returning `dense_shape`.""" + return self._dense_shape @property def graph(self): - """The `Graph` that contains the index, value, and shape tensors.""" + """The `Graph` that contains the index, value, and dense_shape tensors.""" return self._indices.graph def __str__(self): - return "SparseTensor(indices=%s, values=%s, shape=%s)" % ( - self._indices, self._values, self._shape) + return "SparseTensor(indices=%s, values=%s, dense_shape=%s)" % ( + self._indices, self._values, self._dense_shape) def eval(self, feed_dict=None, session=None): """Evaluates this sparse tensor in a `Session`. @@ -220,14 +238,101 @@ def eval(self, feed_dict=None, session=None): Returns: A `SparseTensorValue` object. """ - indices, values, shape = _eval_using_default_session( - [self.indices, self.values, self.shape], feed_dict, self.graph, session) - return SparseTensorValue(indices, values, shape) + indices, values, dense_shape = _eval_using_default_session( + [self.indices, self.values, self.dense_shape], feed_dict, self.graph, + session) + return SparseTensorValue(indices, values, dense_shape) @staticmethod def _override_operator(operator, func): _override_helper(SparseTensor, operator, func) -SparseTensorValue = collections.namedtuple("SparseTensorValue", - ["indices", "values", "shape"]) +class _STVIter(six.Iterator): + """Iterator for the SparseTensorValue.""" + + def __init__(self, st): + self._st = st + self._ix = -1 + + def __iter__(self): # pylint: disable=non-iterator-returned + return self + + def __next__(self): + self._ix += 1 + if self._ix == 0: + return self._st.indices + elif self._ix == 1: + return self._st.values + elif self._ix == 2: + return self._st.dense_shape + else: + raise StopIteration + + +class SparseTensorValue(object): + """Stores the calculated numpy arrays representing a `SparseTensor`. + + Returned as the output of a session.run on a `SparseTensor` object. + """ + + def __init__(self, indices, values, dense_shape=None, shape=None): + self._indices = indices + self._values = values + self._dense_shape = shape or dense_shape + + @property + def indices(self): + return self._indices + + @property + def values(self): + return self._values + + @property + def dense_shape(self): + return self._dense_shape + + @property + def shape(self): + return self._dense_shape + + def __repr__(self): + return "SparseTensorValue(indices=%s, values=%s, dense_shape=%s)" % ( + self._indices, self._values, self._dense_shape) + + def __iter__(self): # pylint: disable=non-iterator-returned + return _STVIter(self) + + def __getitem__(self, i): + return [self.indices, self.values, self.dense_shape][i] + + +def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None): + """Converts value to a `SparseTensor` or `Tensor`. 
+ + Args: + value: A `SparseTensor`, `SparseTensorValue`, or an object whose type has a + registered `Tensor` conversion function. + dtype: Optional element type for the returned tensor. If missing, the + type is inferred from the type of `value`. + name: Optional name to use if a new `Tensor` is created. + + Returns: + A `SparseTensor` or `Tensor` based on `value`. + + Raises: + RuntimeError: If result type is incompatible with `dtype`. + """ + if dtype is not None: + dtype = dtypes.as_dtype(dtype) + if isinstance(value, SparseTensorValue): + value = SparseTensor.from_value(value) + if isinstance(value, SparseTensor): + if dtype and not dtype.is_compatible_with(value.dtype): + raise RuntimeError( + "Sparse dtype: requested = %s, actual = %s" % ( + dtype.name, value.dtype.name)) + return value + return ops.internal_convert_to_tensor( + value, dtype=dtype, name=name) diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py index b5f8142afc6c51..afd815b498d2bc 100644 --- a/tensorflow/python/framework/sparse_tensor_test.py +++ b/tensorflow/python/framework/sparse_tensor_test.py @@ -38,7 +38,7 @@ def testPythonConstruction(self): sparse_tensor.SparseTensor(indices, values, shape))]: self.assertEqual(sp.indices.dtype, dtypes.int64) self.assertEqual(sp.values.dtype, dtypes.string) - self.assertEqual(sp.shape.dtype, dtypes.int64) + self.assertEqual(sp.dense_shape.dtype, dtypes.int64) self.assertEqual(sp.get_shape(), (4, 5)) with self.test_session() as sess: @@ -52,5 +52,31 @@ def testPythonConstruction(self): self.assertAllEqual(sess_run_value.shape, value.shape) +class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase): + + def test_convert_dense(self): + with self.test_session(): + value = [42, 43] + from_value = sparse_tensor.convert_to_tensor_or_sparse_tensor( + value) + self.assertAllEqual(value, from_value.eval()) + + def test_convert_sparse(self): + with self.test_session(): + indices = [[0, 1], [1, 0]] + values = [42, 43] + shape = [2, 2] + sparse_tensor_value = sparse_tensor.SparseTensorValue( + indices, values, shape) + st = sparse_tensor.SparseTensor.from_value(sparse_tensor_value) + from_value = sparse_tensor.convert_to_tensor_or_sparse_tensor( + sparse_tensor_value).eval() + from_tensor = sparse_tensor.convert_to_tensor_or_sparse_tensor(st).eval() + for convertee in [from_value, from_tensor]: + self.assertAllEqual(sparse_tensor_value.indices, convertee.indices) + self.assertAllEqual(sparse_tensor_value.values, convertee.values) + self.assertAllEqual(sparse_tensor_value.dense_shape, convertee.shape) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index d1fcc7c78a28a9..c8575763becc06 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -201,6 +201,13 @@ tf_py_test( additional_deps = ["//tensorflow:tensorflow_py"], ) +tf_py_test( + name = "losses_test", + size = "small", + srcs = ["losses_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], +) + tf_py_test( name = "matrix_inverse_op_test", size = "small", @@ -865,15 +872,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "rnn_cell_test", - size = "small", - srcs = ["rnn_cell_test.py"], - additional_deps = [ - "//tensorflow:tensorflow_py", - ], -) - cuda_py_test( name = "scalar_strict_test", size = "small", @@ -1235,15 +1233,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "seq2seq_test", - size = "medium", - srcs = 
["seq2seq_test.py"], - additional_deps = [ - "//tensorflow:tensorflow_py", - ], -) - cuda_py_test( name = "slice_op_test", size = "medium", @@ -1388,6 +1377,28 @@ sycl_py_test( additional_deps = ["//tensorflow:tensorflow_py"], ) +tf_py_test( + name = "sets_test", + size = "small", + srcs = ["sets_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], +) + +tf_py_test( + name = "metrics_test", + size = "small", + srcs = ["metrics_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], + shard_count = 3, +) + +tf_py_test( + name = "confusion_matrix_test", + size = "small", + srcs = ["confusion_matrix_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py index 1dff6a9f72525e..162bebf8d698f6 100644 --- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py +++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py @@ -22,33 +22,33 @@ import tensorflow as tf -class AtrousConv2DTest(tf.test.TestCase): - - def _upsample_filters(self, filters, rate): - """Upsamples the filters by a factor of rate along the spatial dimensions. +def _upsample_filters(filters, rate): + """Upsamples the filters by a factor of rate along the spatial dimensions. + + Args: + filters: [h, w, in_depth, out_depth]. Original filters. + rate: An int, specifying the upsampling rate. + + Returns: + filters_up: [h_up, w_up, in_depth, out_depth]. Upsampled filters with + h_up = h + (h - 1) * (rate - 1) + w_up = w + (w - 1) * (rate - 1) + containing (rate - 1) zeros between consecutive filter values along + the filters' spatial dimensions. + """ + if rate == 1: + return filters + # [h, w, in_depth, out_depth] -> [in_depth, out_depth, h, w] + filters_up = np.transpose(filters, [2, 3, 0, 1]) + ker = np.zeros([rate, rate], dtype=np.float32) + ker[0, 0] = 1 + filters_up = np.kron(filters_up, ker)[:, :, :-(rate-1), :-(rate-1)] + # [in_depth, out_depth, h_up, w_up] -> [h_up, w_up, in_depth, out_depth] + filters_up = np.transpose(filters_up, [2, 3, 0, 1]) + return filters_up - Args: - filters: [h, w, in_depth, out_depth]. Original filters. - rate: An int, specifying the upsampling rate. - Returns: - filters_up: [h_up, w_up, in_depth, out_depth]. Upsampled filters with - h_up = h + (h - 1) * (rate - 1) - w_up = w + (w - 1) * (rate - 1) - containing (rate - 1) zeros between consecutive filter values along - the filters' spatial dimensions. 
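The removed in-class docstring and body continue below; the module-level `_upsample_filters()` added above keeps the same arithmetic. A small 2-D NumPy illustration of that arithmetic (the real function works on 4-D `[h, w, in_depth, out_depth]` filters and transposes around the Kronecker product):

```python
import numpy as np

h = w = 3
rate = 2
# h_up = h + (h - 1) * (rate - 1), as stated in the docstring.
print(h + (h - 1) * (rate - 1), w + (w - 1) * (rate - 1))  # 5 5

f = np.arange(h * w, dtype=np.float32).reshape(h, w)
ker = np.zeros([rate, rate], dtype=np.float32)
ker[0, 0] = 1
f_up = np.kron(f, ker)[:-(rate - 1), :-(rate - 1)]

print(f_up.shape)                 # (5, 5)
print(np.sum(f) == np.sum(f_up))  # True: upsampling only inserts zeros.
```

The `assertEqual(np.sum(filters), np.sum(filters_up))` check dropped from the old method (just below) verified the same invariant as the last line here.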
- """ - if rate == 1: - return filters - # [h, w, in_depth, out_depth] -> [in_depth, out_depth, h, w] - filters_up = np.transpose(filters, [2, 3, 0, 1]) - ker = np.zeros([rate, rate]) - ker[0, 0] = 1 - filters_up = np.kron(filters_up, ker)[:, :, :-(rate-1), :-(rate-1)] - # [in_depth, out_depth, h_up, w_up] -> [h_up, w_up, in_depth, out_depth] - filters_up = np.transpose(filters_up, [2, 3, 0, 1]) - self.assertEqual(np.sum(filters), np.sum(filters_up)) - return filters_up +class AtrousConv2DTest(tf.test.TestCase): def testAtrousConv2DForward(self): with self.test_session(use_gpu=True): @@ -65,14 +65,13 @@ def testAtrousConv2DForward(self): f = np.arange(np.prod(f_shape), dtype=np.float32).reshape(f_shape) for rate in range(1, 4): - f_up = self._upsample_filters(f, rate) + f_up = _upsample_filters(f, rate) for padding in ["SAME", "VALID"]: y1 = tf.nn.atrous_conv2d(x, f, rate, padding=padding) y2 = tf.nn.conv2d(x, f_up, strides=[1, 1, 1, 1], padding=padding) - self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-2, - atol=1e-2) + self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3) def testAtrousSequence(self): """Tests optimization of sequence of atrous convolutions. @@ -150,5 +149,42 @@ def testGradient(self): self.assertLess(err, err_tolerance) +class AtrousConv2DTransposeTest(tf.test.TestCase): + + def testAtrousConv2DTransposeForward(self): + with self.test_session(use_gpu=True): + # Input: [batch, height, width, input_depth] + height = 9 + for width in [9, 10]: # Test both odd and even width. + x_shape = [2, height, width, 2] + x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + + # Filter: [kernel_height, kernel_width, input_depth, output_depth] + for kernel_height in range(1, 4): + for kernel_width in range(1, 4): + f_shape = [kernel_height, kernel_width, 2, 2] + f = np.arange(np.prod(f_shape), dtype=np.float32).reshape(f_shape) + + for rate in range(1, 4): + f_up = _upsample_filters(f, rate) + kernel_height_up = (kernel_height + + (kernel_height - 1) * (rate - 1)) + kernel_width_up = kernel_width + (kernel_width - 1) * (rate - 1) + + for padding in ["SAME", "VALID"]: + if padding == "SAME": + y_shape = [2, height, width, 2] + else: + y_shape = [2, + height + kernel_height_up - 1, + width + kernel_width_up - 1, + 2] + + y1 = tf.nn.atrous_conv2d_transpose(x, f, y_shape, rate, padding) + y2 = tf.nn.conv2d_transpose( + x, f_up, y_shape, strides=[1, 1, 1, 1], padding=padding) + self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3) + + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py index f1de7febf53a87..ba5d6fc9635b92 100644 --- a/tensorflow/python/kernel_tests/cast_op_test.py +++ b/tensorflow/python/kernel_tests/cast_op_test.py @@ -186,7 +186,7 @@ def testCast(self): self.assertAllEqual(st_cast.indices.eval(), [[0], [1], [2]]) self.assertAllEqual(st_cast.values.eval(), np.array([1, 2, 3], np.float32)) - self.assertAllEqual(st_cast.shape.eval(), [3]) + self.assertAllEqual(st_cast.dense_shape.eval(), [3]) class SaturateCastTest(tf.test.TestCase): diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 49bca0938ff01b..0cb6bbdeddaa55 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -29,7 +29,8 @@ def test_single_tensor_raises(self): tf.assert_proper_iterable(tensor) def test_single_sparse_tensor_raises(self): - ten = 
tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], shape=[3, 4]) + ten = tf.SparseTensor( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) with self.assertRaisesRegexp(TypeError, "proper"): tf.assert_proper_iterable(ten) diff --git a/tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py similarity index 88% rename from tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py rename to tensorflow/python/kernel_tests/confusion_matrix_test.py index a81ef6f9a2a1b8..ff1231de42a4a3 100644 --- a/tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py +++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py @@ -28,8 +28,8 @@ class ConfusionMatrixTest(tf.test.TestCase): def _testConfMatrix(self, predictions, labels, truth, weights=None): with self.test_session(): dtype = predictions.dtype - ans = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtype, weights=weights) + ans = tf.confusion_matrix( + labels, predictions, dtype=dtype, weights=weights) tf_ans = ans.eval() self.assertAllClose(tf_ans, truth, atol=1e-10) self.assertEqual(tf_ans.dtype, dtype) @@ -69,8 +69,8 @@ def _testConfMatrixOnTensors(self, tf_dtype, np_dtype): lab = tf.concat(0, [tf.zeros([20], dtype=tf_dtype), tf.ones([20], dtype=tf_dtype)]) - cm = tf.contrib.metrics.confusion_matrix( - data, lab, dtype=tf_dtype, num_classes=2) + cm = tf.confusion_matrix( + lab, data, dtype=tf_dtype, num_classes=2) d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, @@ -157,28 +157,28 @@ def testInvalidRank(self): predictions = np.asarray([[1, 2, 3]]) labels = np.asarray([1, 2, 3]) self.assertRaisesRegexp(ValueError, "an not squeeze dim", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) predictions = np.asarray([1, 2, 3]) labels = np.asarray([[1, 2, 3]]) self.assertRaisesRegexp(ValueError, "an not squeeze dim", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) def testInputDifferentSize(self): predictions = np.asarray([1, 2, 3]) labels = np.asarray([1, 2]) self.assertRaisesRegexp(ValueError, "must be equal", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) def testOutputIsInt32(self): predictions = np.arange(2) labels = np.arange(2) with self.test_session(): - cm = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtypes.int32) + cm = tf.confusion_matrix( + labels, predictions, dtype=dtypes.int32) tf_cm = cm.eval() self.assertEqual(tf_cm.dtype, np.int32) @@ -186,8 +186,8 @@ def testOutputIsInt64(self): predictions = np.arange(2) labels = np.arange(2) with self.test_session(): - cm = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtypes.int64) + cm = tf.confusion_matrix( + labels, predictions, dtype=dtypes.int64) tf_cm = cm.eval() self.assertEqual(tf_cm.dtype, np.int64) diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 732a604dc293f8..8bad17e54dcea4 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -8,7 +8,7 @@ # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
either express or implied. +# WITHOUT WARRANTIES OiR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== @@ -62,6 +62,16 @@ def check_consumers(graph): return True +def opt_cfg(): + return tf.ConfigProto( + allow_soft_placement=True, + graph_options=tf.GraphOptions( + optimizer_options=tf.OptimizerOptions( + opt_level=tf.OptimizerOptions.L1, + do_function_inlining=True, + do_constant_folding=True))) + + def isum(s): i = tf.constant(0, name="i") c = lambda i, s: tf.less(i, 10) @@ -291,10 +301,12 @@ def testCondSparseTensor(self): values = tf.constant([2.0, 4.0], name="values") indices = tf.constant([[0], [3]], dtype=tf.int64, name="indices") shape = tf.constant([10], dtype=tf.int64, name="dense_shape") - x = tf.SparseTensor(indices, values, shape=shape) + x = tf.SparseTensor(indices, values, dense_shape=shape) pred = tf.less(1, 2) - fn1 = lambda: tf.SparseTensor(indices + 1, x.values + 1, shape=shape) - fn2 = lambda: tf.SparseTensor(indices, x.values - 1, shape=shape) + fn1 = lambda: tf.SparseTensor( + indices + 1, x.values + 1, dense_shape=shape) + fn2 = lambda: tf.SparseTensor( + indices, x.values - 1, dense_shape=shape) r = tf.cond(pred, fn1, fn2) self.assertAllEqual([3.0, 5.0], r.values.eval()) self.assertAllEqual([[1], [4]], r.indices.eval()) @@ -432,7 +444,7 @@ def testCond_7(self): def testCondRef(self): with self.test_session(): - x = gen_state_ops._variable(shape=[1], dtype=tf.float32, + x = gen_state_ops._variable(shape=[1], dtype=tf.float32, name="x", container="", shared_name="") true_fn = lambda: x false_fn = lambda: tf.constant([2.0]) @@ -441,8 +453,8 @@ def testCondRef(self): def testUninitializedRefIdentity(self): with self.test_session() as sess: - v = gen_state_ops._variable(shape=[1], dtype=tf.float32, - name="v", container="", shared_name="") + v = gen_state_ops._variable(shape=[1], dtype=tf.float32, + name="v", container="", shared_name="") inited = state_ops.is_variable_initialized(v) v_f, v_t = control_flow_ops.ref_switch(v, inited) # Both v_f and v_t are uninitialized references. However, an actual use @@ -459,6 +471,30 @@ def testUninitializedRefIdentity(self): merged_op = control_flow_ops.merge([assign_v, orig_v]) self.assertAllEqual([1.0], sess.run(merged_op.output)) + def testCondSwitchIdentity(self): + # Make sure the recv identity is not removed by optimization. + with tf.Session(config=opt_cfg()) as sess: + pred = tf.constant(True) + def fn1(): + return tf.no_op() + def fn2(): + return tf.Assert(False, ["Wrong branch!!!"]) + r = tf.cond(pred, fn1, fn2) + sess.run(r) + + def testCondRecvIdentity(self): + # Make sure the switch identity is not removed by optimization. 
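The opt_cfg() helper added in this hunk turns on aggressive graph optimizations (L1 optimizer level, function inlining, constant folding) so the two new tests can verify that the cond switch/recv identities survive those passes. A minimal standalone sketch of the same configuration, mirroring the constants used in opt_cfg(); the session body is illustrative only:

import tensorflow as tf

config = tf.ConfigProto(
    allow_soft_placement=True,
    graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L1,
            do_function_inlining=True,
            do_constant_folding=True)))

with tf.Session(config=config) as sess:
    pred = tf.constant(True)
    # Mirrors the test: the untaken branch holds a failing Assert, so the run
    # only succeeds if the optimizer keeps the cond structure intact.
    r = tf.cond(pred, lambda: tf.no_op(),
                lambda: tf.Assert(False, ["Wrong branch!!!"]))
    sess.run(r)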
+ with tf.Session(config=opt_cfg()) as sess: + with tf.device("/gpu:0"): + pred = tf.constant(True) + def fn1(): + return tf.no_op() + def fn2(): + with tf.device("/cpu:0"): + return tf.Assert(False, ["Wrong branch!!!"]) + r = tf.cond(pred, fn1, fn2) + sess.run(r) + def testCondGrad_1(self): with self.test_session(): x = tf.constant(10.0, name="x") @@ -608,7 +644,8 @@ def testWhile_5(self): with self.test_session(): def compute(i, c, o): - c = tf.slice(x, tf.expand_dims(i, 0), [1]) + c = tf.strided_slice(x, tf.expand_dims(i, 0), + [1] + tf.expand_dims(i, 0)) o = tf.concat(0, [o, c]) i = tf.add(i, 1) return [i, c, o] @@ -618,9 +655,10 @@ def compute(i, c, o): o = tf.convert_to_tensor([0]) x = tf.convert_to_tensor([1, 2, 3, 4, 5, 6]) s = tf.size(x) - r = tf.while_loop( - lambda i, c, o: tf.less(i, s), compute, [i, c, o], - [i.get_shape(), c.get_shape(), tensor_shape.unknown_shape()]) + r = tf.while_loop(lambda i, c, o: tf.less(i, s), compute, [i, c, o], [ + i.get_shape(), tensor_shape.unknown_shape(), + tensor_shape.unknown_shape() + ]) result = r[2].eval() self.assertTrue(check_op_order(i.graph)) self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result) @@ -704,14 +742,14 @@ def testWhileShapeInferenceSparseTensor(self): indices = tf.constant([[0], [3]], dtype=tf.int64, name="indices") shape = tf.constant([10], dtype=tf.int64, name="dense_shape") i = tf.constant(0) - x = tf.SparseTensor(indices, values, shape=shape) + x = tf.SparseTensor(indices, values, dense_shape=shape) def c(i, _): return i < 10 def b(i, x): return [i + 1, tf.SparseTensor(x.indices, x.values * 2.0, x.shape)] _, r = tf.while_loop(c, b, [i, x]) - self.assertEqual(r.shape.get_shape()[0].value, 1) + self.assertEqual(r.dense_shape.get_shape()[0].value, 1) _, r = tf.while_loop(c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])]) @@ -1653,12 +1691,12 @@ def testWhileGrad_SparseTensor(self): indices = tf.constant([[0], [3]], dtype=tf.int64, name="indices") shape = tf.constant([10], dtype=tf.int64, name="dense_shape") i = tf.constant(0) - x = tf.SparseTensor(indices, values, shape=shape) + x = tf.SparseTensor(indices, values, dense_shape=shape) def c(i, _): return i < 10 def b(i, x): return [i + 1, tf.SparseTensor(x.indices, x.values * 2.0, - x.shape)] + x.dense_shape)] _, r = tf.while_loop(c, b, [i, x]) r = tf.gradients(r.values, values)[0] self.assertAllClose(np.array([1024.0, 1024.0]), r.eval()) @@ -2097,6 +2135,7 @@ def testRefSelect(self): v1 = tf.Variable(p1, validate_shape=False) v2 = tf.Variable(p2, validate_shape=False) v3 = tf.Variable(p3, validate_shape=False) + self.assertIs(None, v1.get_shape().ndims) s = control_flow_ops.ref_select(index, [v1, v2, v3]) self.assertIs(None, s.get_shape().ndims) diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py index aa31c03e196b42..39da808dab1115 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -59,7 +59,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): x_shape = x.shape return tf.SparseTensor( - indices=x_indices, values=x_values, shape=x_shape), x_values + indices=x_indices, values=x_values, dense_shape=x_shape), x_values class UnaryOpTest(tf.test.TestCase): @@ -120,7 +120,8 @@ def _check(self, result_tensor, result_np, input_sp_t, tol): self.assertTrue(isinstance(result_tensor, tf.SparseTensor)) self.assertTrue(isinstance(input_sp_t, tf.SparseTensor)) self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval()) 
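Across these hunks the tf.SparseTensor keyword argument shape is renamed to dense_shape, and the matching property is read as dense_shape instead of shape. A minimal sketch of the updated constructor call, assuming a TensorFlow version that already exposes the renamed argument:

import tensorflow as tf

st = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
# The dense shape is now read back via st.dense_shape rather than st.shape.
dense_shape_tensor = st.dense_shape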
- self.assertAllEqual(input_sp_t.shape.eval(), result_tensor.shape.eval()) + self.assertAllEqual( + input_sp_t.dense_shape.eval(), result_tensor.dense_shape.eval()) if tol is None: self.assertAllClose(result_np, result_tensor.values.eval()) else: diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index 9cbcfd37c9fb23..0ab4778a20caa9 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -119,7 +119,7 @@ def testMapSparseTensor(self): with self.assertRaises(TypeError): tf.map_fn(lambda x: x, tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], values=tf.constant([0, 1, 2]), - shape=[2, 2])) + dense_shape=[2, 2])) def testMap_Scoped(self): with self.test_session() as sess: diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py new file mode 100644 index 00000000000000..2393124ba3155a --- /dev/null +++ b/tensorflow/python/kernel_tests/losses_test.py @@ -0,0 +1,1142 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for losses.""" +# pylint: disable=unused-import,g-bad-import-order +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# pylint: enable=unused-import + +import numpy as np +import tensorflow as tf + + +class AbsoluteDifferenceLossTest(tf.test.TestCase): + + def setUp(self): + self._predictions = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) + self._labels = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + + def testValueErrorThrownWhenWeightIsNone(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.absolute_difference( + self._predictions, self._predictions, weights=None) + + def testAllCorrectNoLossWeight(self): + loss = tf.losses.absolute_difference( + self._predictions, self._predictions) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testNonZeroLoss(self): + loss = tf.losses.absolute_difference( + self._labels, self._predictions) + with self.test_session(): + self.assertAlmostEqual(5.5, loss.eval(), 3) + + def testNonZeroLossWithPythonScalarWeight(self): + weights = 2.3 + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(5.5 * weights, loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + weights = 2.3 + loss = tf.losses.absolute_difference( + self._labels, self._predictions, tf.constant(weights)) + with self.test_session(): + self.assertAlmostEqual(5.5 * weights, loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + weights = tf.constant([1.2, 0.0], shape=[2,]) + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + 
self.assertAlmostEqual(5.6, loss.eval(), 3) + + def testNonZeroLossWithTwoDimBatchSpecificWeights(self): + weights = tf.constant([1.2, 0.0], shape=[2, 1]) + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(5.6, loss.eval(), 3) + + def testNonZeroLossWithSampleSpecificWeights(self): + weights = tf.constant([3, 6, 5, 0, 4, 2], shape=[2, 3]) + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(16.6, loss.eval(), 3) + + def testNonZeroLossWithSampleSpecificWeightsMostZero(self): + weights = tf.constant([0, 0, 0, 0, 0, 2], shape=[2, 3]) + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(6.0, loss.eval(), 3) + + def testLossWithSampleSpecificWeightsAllZero(self): + weights = tf.zeros((2, 3)) + loss = tf.losses.absolute_difference( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + +class SoftmaxCrossEntropyLossTest(tf.test.TestCase): + + def testNoneWeightRaisesValueError(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.softmax_cross_entropy(labels, logits, weights=None) + + def testAllCorrect(self): + with self.test_session(): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + loss = tf.losses.softmax_cross_entropy(labels, logits) + self.assertEquals('softmax_cross_entropy_loss/value', loss.op.name) + self.assertAlmostEqual(loss.eval(), 0.0, 3) + + def testAllWrong(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + + with self.test_session(): + loss = tf.losses.softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 10.0, 3) + + def testNonZeroLossWithPythonScalarWeight(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = 2.3 + with self.test_session(): + loss = tf.losses.softmax_cross_entropy(labels, logits, weights) + self.assertAlmostEqual(weights * 10.0, loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = 2.3 + with self.test_session(): + loss = tf.losses.softmax_cross_entropy( + labels, logits, tf.constant(weights)) + self.assertAlmostEqual(weights * 10.0, loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = tf.constant([1.2, 3.4, 5.6], shape=[3]) + with self.test_session(): + loss = tf.losses.softmax_cross_entropy(labels, logits, weights) + self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3) + + def testAllWrongAllWeightsMissing(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = 
tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = tf.constant([0, 0, 0], shape=[3]) + with self.test_session(): + loss = tf.losses.softmax_cross_entropy(labels, logits, weights) + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testSomeWeightsMissing(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = tf.constant([1.2, 0, 0], shape=[3]) + with self.test_session(): + loss = tf.losses.softmax_cross_entropy(labels, logits, weights) + self.assertAlmostEqual(12.0, loss.eval(), 3) + + def testSoftmaxWithMeasurementSpecificWeightsRaisesException(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + weights = tf.constant([[3, 4, 5], + [2, 6, 0], + [8, 0, 1]]) + + with self.assertRaises(ValueError): + tf.losses.softmax_cross_entropy( + labels, logits, weights=weights).eval() + + def testSoftmaxLabelSmoothing(self): + with self.test_session(): + # Softmax Cross Entropy Loss is: + # -\sum_i p_i \log q_i + # where for a softmax activation + # \log q_i = x_i - \log \sum_j \exp x_j + # = x_i - x_max - \log \sum_j \exp (x_j - x_max) + # For our activations, [100, -100, -100] the log partion function becomes + # \log ( exp(0) + exp(-200) + exp(-200) ) = 0 + # so our log softmaxes become: [0, -200, -200] + # so our cross entropy loss is: + # -(1 - L + L/n) * 0 + 400 * L/n = 400 L/n + logits = tf.constant([[100.0, -100.0, -100.0]]) + labels = tf.constant([[1, 0, 0]]) + label_smoothing = 0.1 + loss = tf.losses.softmax_cross_entropy( + labels, logits, label_smoothing=label_smoothing) + self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value') + expected_value = 400.0 * label_smoothing / 3.0 + self.assertAlmostEqual(loss.eval(), expected_value, 3) + + +class SparseSoftmaxCrossEntropyLossTest(tf.test.TestCase): + + def testNoneWeightRaisesValueError(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0], [1], [2]]) + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=None) + + def testAllCorrectInt32Labels(self): + with self.test_session(): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0], [1], [2]], dtype=tf.int32) + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 0.0, 3) + + def testAllCorrectInt64Labels(self): + with self.test_session(): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[0], [1], [2]], dtype=tf.int64) + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 0.0, 3) + + def testAllCorrectNonColumnLabels(self): + with self.test_session(): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([0, 1, 2]) + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 0.0, 3) + + def testAllWrongInt32Labels(self): + logits = tf.constant([[10.0, 
0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]], dtype=tf.int32) + + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 10.0, 3) + + def testAllWrongInt64Labels(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]], dtype=tf.int64) + + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 10.0, 3) + + def testAllWrongNonColumnLabels(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([2, 0, 1]) + + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 10.0, 3) + + def testNonZeroLossWithPythonScalarWeight(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = 2.3 + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights) + self.assertAlmostEqual(weights * 10.0, loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = 2.3 + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, tf.constant(weights)) + self.assertAlmostEqual(weights * 10.0, loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = tf.constant([1.2, 3.4, 5.6], shape=[3]) + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights) + self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3) + + def testNonZeroLossWithColumnWeights(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = tf.constant([[1.2], [3.4], [5.6]]) + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights) + self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3) + + def testAllWrongAllWeightsMissing(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = tf.constant([0, 0, 0], shape=[3]) + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights) + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testSomeWeightsMissing(self): + logits = tf.constant([[10.0, 0.0, 0.0], + [0.0, 10.0, 0.0], + [0.0, 0.0, 10.0]]) + labels = tf.constant([[2], [0], [1]]) + weights = tf.constant([1.2, 0, 0], shape=[3]) + with self.test_session(): + loss = tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights) + self.assertAlmostEqual(12.0, loss.eval(), 3) + + def testMeasurementSpecificWeightsRaisesException(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = 
tf.constant([[0], [1], [2]]) + weights = tf.constant([[3, 4, 5], + [2, 6, 0], + [8, 0, 1]]) + + with self.assertRaises(ValueError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=weights).eval() + + def testInconsistentWeightSizeRaisesException(self): + """The weight tensor has incorrect number of elements.""" + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[0], [1], [2]]) + weights = tf.constant([1.2, 3.4, 5.6, 7.8]) + + with self.assertRaises(ValueError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=weights).eval() + + def testInconsistentLabelSizeRaisesException(self): + """The label tensor has incorrect number of elements.""" + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[0], [1], [2], [3]]) + weights = tf.constant([1.2, 3.4, 5.6]) + + with self.assertRaises(ValueError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=weights).eval() + + def testInconsistentWeightShapeRaisesException(self): + """The weight tensor has incorrect shape.""" + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0, -100.0], + [-100.0, -100.0, 100.0, -100.0], + [-100.0, -100.0, -100.0, 100.0]]) + labels = tf.constant([[0], [1], [2], [3]]) + weights = tf.constant([[1.2, 3.4], [5.6, 7.8]]) + + with self.assertRaises(ValueError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=weights).eval() + + def testInconsistentLabelShapeRaisesException(self): + """The label tensor has incorrect shape.""" + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0, -100.0], + [-100.0, -100.0, 100.0, -100.0], + [-100.0, -100.0, -100.0, 100.0]]) + labels = tf.constant([[0, 1], [2, 3]]) + weights = tf.constant([1.2, 3.4, 5.6, 7.8]) + + with self.assertRaises(tf.errors.InvalidArgumentError): + tf.losses.sparse_softmax_cross_entropy( + labels, logits, weights=weights).eval() + + +class SigmoidCrossEntropyLossTest(tf.test.TestCase): + + def testAllCorrectSigmoid(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + loss = tf.losses.sigmoid_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value') + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testLossWithSingleDimPlaceholderForLogitsAndWeights1(self): + logits = tf.placeholder(tf.float32, shape=(None, 1)) + labels = tf.placeholder(tf.float32, shape=(None, 1)) + weights = tf.ones_like(logits, dtype=tf.float32) + + loss = tf.losses.sigmoid_cross_entropy(labels, logits, weights) + + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={ + logits: np.ones((32, 1)), + labels: np.ones((32, 1)), + }) + self.assertAlmostEqual(0.313, loss, 3) + + def testLossWithSingleDimPlaceholderForLogitsAndWeights2(self): + logits = tf.placeholder(tf.float32, shape=(None, 2)) + labels = tf.placeholder(tf.float32, shape=(None, 2)) + weights = tf.ones_like(logits, dtype=tf.float32) + + loss = tf.losses.sigmoid_cross_entropy(labels, logits, weights) + + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={ + logits: np.ones((32, 2)), + labels: np.ones((32, 2)), + }) + 
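The 0.313 asserted in these placeholder tests is just the element-wise sigmoid cross entropy of a logit of 1.0 against a label of 1.0; with all-ones weights the reduced loss equals that per-element value. A quick check with plain NumPy, using the max(x, 0) - x*z + log(1 + exp(-|x|)) form quoted later in this file:

import numpy as np

x, z = 1.0, 1.0  # logit and label, both ones as in the feed_dict above
loss = max(x, 0.0) - x * z + np.log1p(np.exp(-abs(x)))
print(round(loss, 3))  # 0.313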
self.assertAlmostEqual(0.313, loss, 3) + + def testAllWrongSigmoid(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + loss = tf.losses.sigmoid_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value') + self.assertAlmostEqual(loss.eval(), 600.0 / 9.0, 3) + + def testAllWrongSigmoidWithMeasurementSpecificWeights(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0]]) + labels = tf.constant([[0, 0, 1], + [1, 0, 0], + [0, 1, 0]]) + weights = tf.constant([[3, 4, 5], + [2, 6, 0], + [8, 0, 1]]) + loss = tf.losses.sigmoid_cross_entropy( + labels, logits, weights) + self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value') + self.assertAlmostEqual(1700.0 / 7.0, loss.eval(), 3) + + def testMultiCorrectSigmoid(self): + logits = tf.constant([[100.0, -100.0, 100.0], + [100.0, 100.0, -100.0], + [-100.0, 100.0, 100.0]]) + labels = tf.constant([[1, 0, 1], + [1, 1, 0], + [0, 1, 1]]) + loss = tf.losses.sigmoid_cross_entropy(labels, logits) + self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value') + + with self.test_session(): + self.assertAlmostEqual(loss.eval(), 0.0, 3) + + def testSigmoidLabelSmoothingCorrect(self): + with self.test_session(): + logits = tf.constant([[100.0, -100.0, -100.0]]) + labels = tf.constant([[1, 0, 1]]) + # Sigmoid cross entropy loss is: + # max(x,0) - x*z + log(1 + exp(-abs(x))) + # The new labels are: + # z' = z * (1 - L) + 0.5 L + # 1 -> 1 - 0.5 L + # 0 -> 0.5 L + # here we expect: + # 1/3 * (100 - 100 * (1 - 0.5 L) + 0 + # + 0 + 100 * (0.5 L) + 0 + # + 0 + 100 * (1 - 0.5 L) + 0) + # = 1/3 * (100 + 50 L) + label_smoothing = 0.1 + loss = tf.losses.sigmoid_cross_entropy( + labels, logits, label_smoothing=label_smoothing) + self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value') + expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 + self.assertAlmostEqual(loss.eval(), expected_value, 3) + + def testSigmoidLabelSmoothingEqualsSoftmaxTwoLabel(self): + with self.test_session(): + label_smoothing = 0.1 + sigmoid_logits = tf.constant([[100.0, -100.0, -100.0]]) + sigmoid_labels = tf.constant([[1, 0, 1]]) + sigmoid_loss = tf.losses.sigmoid_cross_entropy( + sigmoid_labels, sigmoid_logits, label_smoothing=label_smoothing) + + softmax_logits = tf.constant([[0.0, 100.0], [100.0, 0.0], [100.0, 0.0]]) + softmax_labels = tf.constant([[0, 1], [1, 0], [0, 1]]) + softmax_loss = tf.losses.softmax_cross_entropy( + softmax_labels, softmax_logits, label_smoothing=label_smoothing) + self.assertAlmostEqual(sigmoid_loss.eval(), softmax_loss.eval(), 3) + + +class LogLossTest(tf.test.TestCase): + + def setUp(self): + predictions = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3)) + labels = np.asarray([1.0, 0.0, 1.0, 1.0, 0.0, 0.0]).reshape((2, 3)) + + self._np_predictions = predictions + self._np_labels = labels + + epsilon = 1e-7 + self._expected_losses = np.multiply( + labels, np.log(predictions + epsilon)) + np.multiply( + 1 - labels, np.log(1 - predictions + epsilon)) + + self._predictions = tf.constant(predictions) + self._labels = tf.constant(labels) + + def testValueErrorThrownWhenWeightIsNone(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.log_loss(self._labels, self._labels, weights=None) + + def testAllCorrectNoLossWeight(self): + loss = 
tf.losses.log_loss(self._labels, self._labels) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testAllCorrectNoLossWeightWithPlaceholder(self): + tf_predictions = tf.placeholder(tf.float32, shape=self._np_labels.shape) + loss = tf.losses.log_loss(self._labels, tf_predictions) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(feed_dict={ + tf_predictions: self._np_labels}), 3) + + def testNonZeroLoss(self): + loss = tf.losses.log_loss(self._labels, self._predictions) + with self.test_session(): + self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0, + loss.eval(), 3) + + def testNonZeroLossWithPythonScalarWeight(self): + weights = 2.3 + loss = tf.losses.log_loss( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0, + loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + weights = 2.3 + loss = tf.losses.log_loss( + self._labels, self._predictions, tf.constant(weights)) + with self.test_session(): + self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0, + loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self): + tf_predictions = tf.placeholder(tf.float32, + shape=self._np_predictions.shape) + weights = 2.3 + loss = tf.losses.log_loss( + self._labels, tf_predictions, tf.constant(weights)) + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={tf_predictions: self._np_predictions}) + self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0, + loss, 3) + + def testNonZeroLossWithScalarTensorWeightAndPlaceholderWithRankOnly(self): + tf_predictions = tf.placeholder(tf.float32, shape=[None, None]) + weights = 2.3 + loss = tf.losses.log_loss( + self._labels, tf_predictions, tf.constant(weights)) + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={tf_predictions: self._np_predictions}) + self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0, + loss, 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + weights = tf.constant([1.2, 3.4], shape=[2]) + expected_losses = np.multiply( + self._expected_losses, + np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))) + loss = tf.losses.log_loss( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(-np.sum(expected_losses) / 6.0, + loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeightsSomeZero(self): + weights = tf.constant([1.2, 0], shape=[2]) + expected_losses = np.multiply( + self._expected_losses, + np.asarray([1.2, 1.2, 1.2, 0, 0, 0]).reshape((2, 3))) + loss = tf.losses.log_loss( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, + loss.eval(), 3) + + def testNonZeroLossWithTwoDimBatchSpecificWeightsSomeZero(self): + weights = tf.constant([1.2, 0], shape=[2, 1]) + expected_losses = np.multiply( + self._expected_losses, + np.asarray([1.2, 1.2, 1.2, 0, 0, 0]).reshape((2, 3))) + loss = tf.losses.log_loss( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, + loss.eval(), 3) + + def testWeightsWithSameNumDimsButWrongShapeThrowsException(self): + weights = tf.constant(np.random.normal(size=(2, 4)), shape=[2, 4]) + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.log_loss(self._labels, self._predictions, weights) + + def 
testNonZeroLossWithMeasurementSpecificWeights(self): + weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3)) + expected_losses = np.multiply(self._expected_losses, weights) + + loss = tf.losses.log_loss( + self._labels, + self._predictions, + tf.constant(weights, shape=(2, 3))) + with self.test_session(): + self.assertAlmostEqual(-np.sum(expected_losses) / 5.0, loss.eval(), 3) + + def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self): + weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3)) + expected_losses = np.multiply(self._expected_losses, weights) + + tf_predictions = tf.placeholder(tf.float32, shape=[2, 3]) + loss = tf.losses.log_loss( + self._labels, + tf_predictions, + tf.constant(weights, shape=(2, 3))) + + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={tf_predictions: self._np_predictions}) + self.assertAlmostEqual(-np.sum(expected_losses) / 5.0, loss, 3) + + def testNonZeroLossWithSampleSpecificWeightsMostZero(self): + weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3)) + expected_losses = np.multiply(self._expected_losses, weights) + + loss = tf.losses.log_loss( + self._labels, + self._predictions, + tf.constant(weights, shape=(2, 3))) + with self.test_session(): + self.assertAlmostEqual(-np.sum(expected_losses), loss.eval(), 3) + + def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self): + weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3)) + expected_losses = np.multiply(self._expected_losses, weights) + + tf_predictions = tf.placeholder(tf.float32, shape=[2, 3]) + tf_weights = tf.constant(weights, shape=(2, 3)) + loss = tf.losses.log_loss(self._labels, tf_predictions, tf_weights) + + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={tf_predictions: self._np_predictions}) + self.assertAlmostEqual(-np.sum(expected_losses), loss, 3) + + def testLossWithSampleSpecificWeightsAllZero(self): + tf_weights = tf.zeros(shape=(2, 3)) + loss = tf.losses.log_loss( + self._labels, self._predictions, tf_weights) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + +class HingeLossTest(tf.test.TestCase): + + def testIncompatibleShapes(self): + with self.test_session(): + logits = tf.constant([[-1.0], [2.1]]) + labels = tf.constant([0.0, 1.0]) + with self.assertRaises(ValueError): + _ = tf.losses.hinge_loss(labels, logits).eval() + + def testAllOutsideMargin(self): + with self.test_session(): + logits = tf.constant([1.2, -1.4, -1.0, 2.1]) + labels = tf.constant([1.0, 0.0, 0.0, 1.0]) + loss = tf.losses.hinge_loss(labels, logits) + self.assertAllClose(loss.eval(), 0.0, atol=1e-3) + + def testSomeInsideMargin(self): + with self.test_session(): + logits = tf.constant([[-0.7], [-1.4], [1.4], [0.6]]) + labels = tf.constant([[0.0], [0.0], [1.0], [1.0]]) + loss = tf.losses.hinge_loss(labels, logits) + # Examples 1 and 4 are on the correct side of the hyperplane but within + # the margin so they incur some (small) loss. + self.assertAllClose(loss.eval(), 0.175, atol=1e-3) + + def testSomeMisclassified(self): + with self.test_session(): + logits = tf.constant([[[1.2], [0.4], [-1.0], [-1.1]]]) + labels = tf.constant([[[1.0], [0.0], [0.0], [1.0]]]) + loss = tf.losses.hinge_loss(labels, logits) + # Examples 2 and 4 are on the wrong side of the hyperplane so they incur + # some (fairly large) loss. 
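The 0.875 asserted next follows from the usual hinge form max(0, 1 - (2*label - 1) * logit), which maps the 0/1 labels onto -1/+1. A quick check in plain Python with the constants from testSomeMisclassified:

logits = [1.2, 0.4, -1.0, -1.1]
labels = [1.0, 0.0, 0.0, 1.0]
per_example = [max(0.0, 1.0 - (2.0 * z - 1.0) * x) for x, z in zip(logits, labels)]
print(sum(per_example) / len(per_example))  # (0 + 1.4 + 0 + 2.1) / 4 = 0.875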
+ self.assertAllClose(loss.eval(), 0.875, atol=1e-3) + + +class MeanSquaredErrorTest(tf.test.TestCase): + + def setUp(self): + self._predictions = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) + self._labels = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + + def testValueErrorThrownWhenWeightIsNone(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.mean_squared_error( + self._predictions, self._predictions, weights=None) + + def testAllCorrectNoLossWeight(self): + loss = tf.losses.mean_squared_error( + self._predictions, self._predictions) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testNonZeroLoss(self): + loss = tf.losses.mean_squared_error( + self._labels, self._predictions) + with self.test_session(): + self.assertAlmostEqual(49.5, loss.eval(), 3) + + def testNonZeroLossWithPythonScalarWeight(self): + weights = 2.3 + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(49.5 * weights, loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + weights = 2.3 + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, tf.constant(weights)) + with self.test_session(): + self.assertAlmostEqual(49.5 * weights, loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + weights = tf.constant([1.2, 3.4], shape=[2,]) + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3) + + def testNonZeroLossWithTwoDimBatchSpecificWeights(self): + weights = tf.constant([1.2, 3.4], shape=[2, 1]) + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3) + + def testNonZeroLossWithSampleSpecificWeights(self): + weights = tf.constant([3, 6, 5, 0, 4, 2], shape=[2, 3]) + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(587 / 5.0, loss.eval(), 3) + + def testNonZeroLossWithSampleSpecificWeightsMostZero(self): + weights = tf.constant([0, 0, 0, 0, 0, 2], shape=[2, 3]) + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(18.0, loss.eval(), 3) + + def testLossWithSampleSpecificWeightsAllZero(self): + weights = tf.zeros((2, 3)) + loss = tf.losses.mean_squared_error( + self._labels, self._predictions, weights) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + +class MeanPairwiseSquaresErrorTest(tf.test.TestCase): + + def setUp(self): + self._predictions = np.array([[4, 8, 12], + [8, 1, 3]]) + self._labels = np.array([[1, 9, 2], + [-5, -5, 7]]) + + batch_size, dims = self._labels.shape + + # Compute the expected loss 'manually'. 
+ total = np.zeros((batch_size, 1)) + for b in range(batch_size): + for i in range(dims): + for j in range(dims): + x = self._predictions[b, i].item() - self._predictions[b, j].item() + y = self._labels[b, i].item() - self._labels[b, j].item() + tmp = (x-y) * (x-y) + total[b] += tmp + + self._expected_losses = np.divide(total, 9.0) + + def testValueErrorThrownWhenWeightIsNone(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._labels), + labels=tf.constant(self._labels), + weights=None) + + def testAllCorrectNoLossWeight(self): + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._labels), + labels=tf.constant(self._labels)) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + def testNonZeroLoss(self): + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels)) + with self.test_session(): + self.assertAlmostEqual(np.sum(self._expected_losses), loss.eval(), 3) + + def testGradientWithZeroWeight(self): + with tf.Graph().as_default(): + tf.set_random_seed(0) + + inputs = tf.ones((2, 3)) + weights = tf.get_variable('weights', + shape=[3, 4], + initializer=tf.truncated_normal_initializer()) + predictions = tf.matmul(inputs, weights) + + optimizer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9) + loss = tf.losses.mean_pairwise_squared_error( + predictions, + predictions, + 0) + + gradients_to_variables = optimizer.compute_gradients(loss) + + init_op = tf.global_variables_initializer() + + with self.test_session() as sess: + sess.run(init_op) + for grad, _ in gradients_to_variables: + np_grad = sess.run(grad) + self.assertFalse(np.isnan(np_grad).any()) + + def testNonZeroLossWithPythonScalarWeight(self): + weights = 2.3 + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=weights) + with self.test_session(): + self.assertAlmostEqual(weights * np.sum(self._expected_losses), + loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeight(self): + weights = 2.3 + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=tf.constant(weights)) + with self.test_session(): + self.assertAlmostEqual(weights * np.sum(self._expected_losses), + loss.eval(), 3) + + def testNonZeroLossWithScalarZeroWeight(self): + weights = 0 + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=tf.constant(weights)) + with self.test_session(): + self.assertAlmostEqual(0, loss.eval(), 3) + + def testNonZeroLossWithScalarTensorWeightWithPlaceholder(self): + weights = 2.3 + tf_predictions = tf.placeholder(tf.float32, shape=self._predictions.shape) + tf_labels = tf.placeholder(tf.float32, shape=self._labels.shape) + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf_predictions, + labels=tf_labels, + weights=tf.constant(weights)) + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={ + tf_predictions: self._predictions, + tf_labels: self._labels, + }) + self.assertAlmostEqual(weights * np.sum(self._expected_losses), loss, 3) + + def testNonZeroLossWithOneDimBatchSpecificWeights(self): + weights = np.asarray([2.0, 1.0]).reshape((2, 1)) + expected_losses = np.multiply(weights, self._expected_losses) + + loss = 
tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=tf.constant(weights, shape=[2])) + with self.test_session(): + self.assertAlmostEqual(np.sum(expected_losses), loss.eval(), 3) + + def testZeroLossWithOneDimBatchZeroWeights(self): + weights = np.asarray([0.0, 0.0]).reshape((2, 1)) + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=tf.constant(weights, shape=[2])) + with self.test_session(): + self.assertAlmostEqual(0, loss.eval(), 3) + + def testNonZeroLossWithOneDimBatchSpecificWeightsAndPlaceholders(self): + weights = np.asarray([1.2, 3.4]).reshape((2, 1)) + expected_losses = np.multiply(weights, self._expected_losses) + + tf_predictions = tf.placeholder(tf.float32, shape=self._predictions.shape) + tf_labels = tf.placeholder(tf.int32, shape=self._labels.shape) + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf_predictions, + labels=tf_labels, + weights=tf.constant(weights, shape=[2])) + + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={ + tf_predictions: self._predictions, + tf_labels: self._labels, + }) + self.assertAlmostEqual(np.sum(expected_losses), loss, 3) + + def testLossWithAllZeroBatchSpecificWeights(self): + weights = np.zeros((2, 1)) + loss = tf.losses.mean_pairwise_squared_error( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + weights=tf.constant(weights, shape=[2])) + with self.test_session(): + self.assertAlmostEqual(0.0, loss.eval(), 3) + + +class CosineDistanceLossTest(tf.test.TestCase): + + def setUp(self): + self._predictions = np.asarray([[1, 0, 0], # Batch 1 + [0, 0, -1], + [1, 0, 0], # Batch 2 + [1, 0, 0], + [0, 0, -1], # Batch 3 + [1, 0, 0]]).reshape((3, 2, 3)) + + self._labels = np.asarray([[1, 0, 0], + [0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [0, 0, 1], + [0, 1, 0]]).reshape((3, 2, 3)) + + def testValueErrorThrownWhenWeightIsNone(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.cosine_distance( + predictions=tf.constant(self._labels), + labels=tf.constant(self._labels), + dim=2, + weights=None) + + def testAllCorrectNoWeights(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._labels), + labels=tf.constant(self._labels), + dim=2) + with self.test_session(): + self.assertAlmostEqual(0, loss.eval(), 5) + + def testPartiallyCorrectWithIntegerValues(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + dim=2) + with self.test_session(): + self.assertAlmostEqual(1, loss.eval(), 5) + + def testPartiallyCorrectFloatingPointValues(self): + predictions = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '-0.665139432070255 -0.739487441769973 -0.103671883216994;' + '0.707106781186548 -0.707106781186548 0')) + labels = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '0.665139432070255 0.739487441769973 0.103671883216994;' + '0.707106781186548 0.707106781186548 0')) + + tf_preds = tf.constant(predictions, shape=(3, 1, 3), dtype=tf.float32) + tf_labels = tf.constant(labels, shape=(3, 1, 3), dtype=tf.float32) + loss = tf.losses.cosine_distance(tf_labels, tf_preds, dim=2) + + with self.test_session(): + self.assertAlmostEqual(1.0, loss.eval(), 5) + + def testSampleSpecificWeights(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._predictions), 
+ labels=tf.constant(self._labels), + dim=2, + weights=tf.constant([1, 0, 0])) + with self.test_session(): + self.assertEqual(1.0, loss.eval()) + + def testMeasurementSpecificWeights(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + dim=2, + weights=tf.constant([1, 0, 0, 1, 1, 1], shape=(3, 2))) + with self.test_session(): + self.assertEqual(3.0 / 4.0, loss.eval()) + + def testValueErrorThrownWithShapelessPlaceholder(self): + tf_predictions = tf.placeholder(tf.float32) + with self.test_session(): + with self.assertRaises(ValueError): + tf.losses.cosine_distance( + predictions=tf_predictions, + labels=tf.constant(self._labels), + dim=2, + weights=tf.constant([1, 0, 0, 1, 1, 1], shape=(3, 2))) + + def testMeasurementSpecificWeightsWithPlaceholderWithShape(self): + tf_predictions = tf.placeholder(tf.float32, shape=self._labels.shape) + loss = tf.losses.cosine_distance( + predictions=tf_predictions, + labels=tf.constant(self._labels), + dim=2, + weights=tf.constant([1, 0, 0, 1, 1, 1], shape=(3, 2))) + with self.test_session() as sess: + loss = sess.run(loss, feed_dict={tf_predictions: self._predictions}) + self.assertEqual(3.0 / 4.0, loss) + + def testZeroLossWhenAllSampleSpecificWeightsAreZero(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + dim=2, + weights=tf.zeros((3,))) + with self.test_session(): + self.assertEqual(0, loss.eval()) + + def testZeroLossWhenAllMeasurementSpecificWeightsAreZero(self): + loss = tf.losses.cosine_distance( + predictions=tf.constant(self._predictions), + labels=tf.constant(self._labels), + dim=2, + weights=tf.zeros((3, 2))) + with self.test_session(): + self.assertEqual(0, loss.eval()) + + +class AddLossTest(tf.test.TestCase): + + def testNoCollectLossesBatch2(self): + logits = tf.constant([[1.2, 0.4, -1.0, -1.1]] * 2) + labels = tf.constant([[1.0, 0.0, 0.0, 1.0]] * 2) + self.assertFalse(tf.losses.get_losses()) + tf.losses.absolute_difference(logits, labels, loss_collection=None) + tf.losses.log_loss(logits, labels, loss_collection=None) + tf.losses.mean_squared_error(logits, labels, loss_collection=None) + tf.losses.sigmoid_cross_entropy(logits, labels, loss_collection=None) + tf.losses.softmax_cross_entropy(logits, labels, loss_collection=None) + self.assertFalse(tf.losses.get_losses()) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py new file mode 100644 index 00000000000000..28b1811805bf64 --- /dev/null +++ b/tensorflow/python/kernel_tests/metrics_test.py @@ -0,0 +1,3360 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for metrics.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf +from tensorflow.python.ops import metrics + +NAN = float('nan') + + +def _enqueue_vector(sess, queue, values, shape=None): + if not shape: + shape = (1, len(values)) + dtype = queue.dtypes[0] + sess.run(queue.enqueue(tf.constant(values, dtype=dtype, shape=shape))) + + +def _binary_2d_label_to_sparse_value(labels): + """Convert dense 2D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensorValue` whose values are indices along the last dimension of + `labels`. + """ + indices = [] + values = [] + batch = 0 + for row in labels: + label = 0 + xi = 0 + for x in row: + if x == 1: + indices.append([batch, xi]) + values.append(label) + xi += 1 + else: + assert x == 0 + label += 1 + batch += 1 + shape = [len(labels), len(labels[0])] + return tf.SparseTensorValue( + np.array(indices, np.int64), + np.array(values, np.int64), + np.array(shape, np.int64)) + + +def _binary_2d_label_to_sparse(labels): + """Convert dense 2D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensor` whose values are indices along the last dimension of + `labels`. + """ + return tf.SparseTensor.from_value(_binary_2d_label_to_sparse_value(labels)) + + +def _binary_3d_label_to_sparse_value(labels): + """Convert dense 3D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensorValue` whose values are indices along the last dimension of + `labels`. + """ + indices = [] + values = [] + for d0, labels_d0 in enumerate(labels): + for d1, labels_d1 in enumerate(labels_d0): + d2 = 0 + for class_id, label in enumerate(labels_d1): + if label == 1: + values.append(class_id) + indices.append([d0, d1, d2]) + d2 += 1 + else: + assert label == 0 + shape = [len(labels), len(labels[0]), len(labels[0][0])] + return tf.SparseTensorValue( + np.array(indices, np.int64), + np.array(values, np.int64), + np.array(shape, np.int64)) + + +def _binary_3d_label_to_sparse(labels): + """Convert dense 3D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensor` whose values are indices along the last dimension of + `labels`. + """ + return tf.SparseTensor.from_value(_binary_3d_label_to_sparse_value(labels)) + + +def _assert_nan(test_case, actual): + test_case.assertTrue(math.isnan(actual), 'Expected NAN, got %s.' 
% actual) + + +def _assert_local_variables(test_case, expected): + test_case.assertEquals( + set(expected), set(v.name for v in tf.local_variables())) + + +class MeanTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean(tf.ones([4, 3])) + _assert_local_variables(self, ('mean/count:0', 'mean/total:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean( + tf.ones([4, 3]), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean( + tf.ones([4, 3]), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testBasic(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAlmostEqual(1.65, sess.run(mean), 5) + + def testUpdateOpsReturnsCurrentValue(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean(values) + + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.5, sess.run(update_op), 5) + self.assertAlmostEqual(1.475, sess.run(update_op), 5) + self.assertAlmostEqual(12.4/6.0, sess.run(update_op), 5) + self.assertAlmostEqual(1.65, sess.run(update_op), 5) + + self.assertAlmostEqual(1.65, sess.run(mean), 5) + + def test1dWeightedValues(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weighted labels. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [1]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for _ in range(4): + update_op.eval() + self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5) + + def test1dWeightedValues_placeholders(self): + with self.test_session() as sess: + # Create the queue that populates the values. + feed_values = ( + (0, 1), + (-4.2, 9.1), + (6.5, 0), + (-3.2, 4.0) + ) + values = tf.placeholder(dtype=tf.float32) + + # Create the queue that populates the weighted labels. 
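All of these metric tests exercise the same two-op protocol: each metrics function returns a (value, update_op) pair backed by local variables, so tf.local_variables_initializer() must run before any update. A minimal sketch of that usage pattern without the FIFOQueue plumbing the tests use, reproducing the 1.475 running mean checked in testUpdateOpsReturnsCurrentValue above:

import tensorflow as tf
from tensorflow.python.ops import metrics

values = tf.placeholder(tf.float32, shape=(1, 2))
mean, update_op = metrics.mean(values)

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    # Feed the first two batches used by the queue-based tests.
    for batch in ([[0.0, 1.0]], [[-4.2, 9.1]]):
        sess.run(update_op, feed_dict={values: batch})
    print(sess.run(mean))  # mean of 0, 1, -4.2, 9.1 -> 1.475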
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [1]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for i in range(4): + update_op.eval(feed_dict={values: feed_values[i]}) + self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5) + + def test2dWeightedValues(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weighted labels. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for _ in range(4): + update_op.eval() + self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5) + + def test2dWeightedValues_placeholders(self): + with self.test_session() as sess: + # Create the queue that populates the values. + feed_values = ( + (0, 1), + (-4.2, 9.1), + (6.5, 0), + (-3.2, 4.0) + ) + values = tf.placeholder(dtype=tf.float32) + + # Create the queue that populates the weighted labels. 
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for i in range(4): + update_op.eval(feed_dict={values: feed_values[i]}) + self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5) + + +class StreamingMeanTensorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_tensor(tf.ones([4, 3])) + _assert_local_variables(self, ( + 'mean/total_tensor:0', 'mean/count_tensor:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_tensor( + tf.ones([4, 3]), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_tensor( + tf.ones([4, 3]), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testBasic(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[-0.9/4., 3.525]], sess.run(mean)) + + def testMultiDimensional(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(2, 2, 2)) + _enqueue_vector(sess, + values_queue, + [[[1, 2], [1, 2]], [[1, 2], [1, 2]]], + shape=(2, 2, 2)) + _enqueue_vector(sess, + values_queue, + [[[1, 2], [1, 2]], [[3, 4], [9, 10]]], + shape=(2, 2, 2)) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(2): + sess.run(update_op) + self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]], + sess.run(mean)) + + def testUpdateOpsReturnsCurrentValue(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + + self.assertAllClose([[0, 1]], sess.run(update_op), 5) + self.assertAllClose([[-2.1, 5.05]], sess.run(update_op), 5) + self.assertAllClose([[2.3/3., 10.1/3.]], sess.run(update_op), 5) + self.assertAllClose([[-0.9/4., 3.525]], sess.run(update_op), 5) + + self.assertAllClose([[-0.9/4., 3.525]], sess.run(mean), 5) + + def testWeighted1d(self): + with self.test_session() as sess: + # Create the queue that populates the values. 
+ values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [[1]]) + _enqueue_vector(sess, weights_queue, [[0]]) + _enqueue_vector(sess, weights_queue, [[1]]) + _enqueue_vector(sess, weights_queue, [[0]]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5) + + def testWeighted2d_1(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5) + + def testWeighted2d_2(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. 
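+      # The weights below zero out the first column entirely; with a total
+      # weight of 0 the metric reports 0 for that element, while the second
+      # column averages (1 + 0) / 2 = 0.5, giving the asserted [[0, 0.5]].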
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[0, 0.5]], sess.run(mean), 5) + + +class AccuracyTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.accuracy( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + name='my_accuracy') + _assert_local_variables(self, ( + 'my_accuracy/count:0', 'my_accuracy/total:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.accuracy( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.accuracy( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones((10, 3)) + labels = tf.ones((10, 4)) + with self.assertRaises(ValueError): + metrics.accuracy(labels, predictions) + + def testPredictionsAndWeightsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones((10, 3)) + labels = tf.ones((10, 3)) + weights = tf.ones((9, 3)) + with self.assertRaises(ValueError): + metrics.accuracy(labels, predictions, weights) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=3, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=3, dtype=tf.int64, seed=1) + accuracy, update_op = metrics.accuracy( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_accuracy = accuracy.eval() + for _ in range(10): + self.assertEqual(initial_accuracy, accuracy.eval()) + + def testMultipleUpdates(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [2]) + _enqueue_vector(sess, preds_queue, [1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. 
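+      # Only the first two (prediction, label) pairs match, so after all four
+      # updates the streaming accuracy is 2 / 4 = 0.5.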
+      labels_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, labels_queue, [0])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [2])
+      labels = labels_queue.dequeue()
+
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      for _ in xrange(3):
+        sess.run(update_op)
+      self.assertEqual(0.5, sess.run(update_op))
+      self.assertEqual(0.5, accuracy.eval())
+
+  def testEffectivelyEquivalentSizes(self):
+    predictions = tf.ones((40, 1))
+    labels = tf.ones((40,))
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertEqual(1.0, update_op.eval())
+      self.assertEqual(1.0, accuracy.eval())
+
+  def testEffectivelyEquivalentSizesWithStaticShapedWeight(self):
+    predictions = tf.convert_to_tensor([1, 1, 1])  # shape 3,
+    labels = tf.expand_dims(tf.convert_to_tensor([1, 0, 0]), 1)  # shape 3, 1
+    weights = tf.expand_dims(tf.convert_to_tensor([100, 1, 1]), 1)  # shape 3, 1
+
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions, weights)
+
+      sess.run(tf.local_variables_initializer())
+      # If the accuracy metric did not flatten the weights, the intended
+      # broadcast of the weights would give an accuracy of 0.33333334.
+      # Because the weights are flattened, the accuracy is higher than 0.95.
+      self.assertGreater(update_op.eval(), .95)
+      self.assertGreater(accuracy.eval(), .95)
+
+  def testEffectivelyEquivalentSizesWithDynamicallyShapedWeight(self):
+    predictions = tf.convert_to_tensor([1, 1, 1])  # shape 3,
+    labels = tf.expand_dims(tf.convert_to_tensor([1, 0, 0]), 1)  # shape 3, 1
+
+    weights = [[100], [1], [1]]  # shape 3, 1
+    weights_placeholder = tf.placeholder(dtype=tf.int32, name='weights')
+    feed_dict = {weights_placeholder: weights}
+
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions, weights_placeholder)
+
+      sess.run(tf.local_variables_initializer())
+      # If the accuracy metric did not flatten the weights, the intended
+      # broadcast of the weights would give an accuracy of 0.33333334.
+      # Because the weights are flattened, the accuracy is higher than 0.95.
+      self.assertGreater(update_op.eval(feed_dict=feed_dict), .95)
+      self.assertGreater(accuracy.eval(feed_dict=feed_dict), .95)
+
+  def testMultipleUpdatesWithWeightedValues(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the predictions.
+      preds_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, preds_queue, [0])
+      _enqueue_vector(sess, preds_queue, [1])
+      _enqueue_vector(sess, preds_queue, [2])
+      _enqueue_vector(sess, preds_queue, [1])
+      predictions = preds_queue.dequeue()
+
+      # Create the queue that populates the labels.
+      labels_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, labels_queue, [0])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [2])
+      labels = labels_queue.dequeue()
+
+      # Create the queue that populates the weights.
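+      # The weights of 1, 1, 0, 0 discard the two mismatched pairs, so only
+      # the correct pairs (0, 0) and (1, 1) are counted and the weighted
+      # accuracy is 1.0.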
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.int64, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + weights = weights_queue.dequeue() + + accuracy, update_op = metrics.accuracy( + labels, predictions, weights) + + sess.run(tf.local_variables_initializer()) + for _ in xrange(3): + sess.run(update_op) + self.assertEqual(1.0, sess.run(update_op)) + self.assertEqual(1.0, accuracy.eval()) + + +class PrecisionTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.precision( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'precision/false_positives/count:0', + 'precision/true_positives/count:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.precision( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.precision( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_precision = precision.eval() + for _ in range(10): + self.assertEqual(initial_precision, precision.eval()) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs) + labels = tf.constant(inputs) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1, sess.run(update_op)) + self.assertAlmostEqual(1, precision.eval()) + + def testSomeCorrect(self): + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4)) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, update_op.eval()) + self.assertAlmostEqual(0.5, precision.eval()) + + def testWeighted1d(self): + predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[2], [5]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 2.0 + 5.0 + weighted_positives = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, precision.eval()) + + def testWeighted1d_placeholders(self): + predictions = tf.placeholder(dtype=tf.float32) + labels = tf.placeholder(dtype=tf.float32) + feed_dict = { + predictions: ((1, 0, 1, 0), (1, 0, 1, 0)), + labels: ((0, 1, 1, 0), (1, 0, 0, 1)) + } + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[2], [5]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 2.0 + 5.0 + weighted_positives = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual( + expected_precision, update_op.eval(feed_dict=feed_dict)) + self.assertAlmostEqual( + expected_precision, precision.eval(feed_dict=feed_dict)) + + def testWeighted2d(self): + predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 3.0 + 4.0 + weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, precision.eval()) + + def testWeighted2d_placeholders(self): + predictions = tf.placeholder(dtype=tf.float32) + labels = tf.placeholder(dtype=tf.float32) + feed_dict = { + predictions: ((1, 0, 1, 0), (1, 0, 1, 0)), + labels: ((0, 1, 1, 0), (1, 0, 0, 1)) + } + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 3.0 + 4.0 + weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual( + expected_precision, update_op.eval(feed_dict=feed_dict)) + self.assertAlmostEqual( + expected_precision, precision.eval(feed_dict=feed_dict)) + + def testAllIncorrect(self): + inputs = 
np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs) + labels = tf.constant(1 - inputs) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertAlmostEqual(0, precision.eval()) + + def testZeroTrueAndFalsePositivesGivesZeroPrecision(self): + predictions = tf.constant([0, 0, 0, 0]) + labels = tf.constant([0, 0, 0, 0]) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0.0, precision.eval()) + + +class StreamingRecallTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.recall( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'recall/false_negatives/count:0', + 'recall/true_positives/count:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.recall( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.recall( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + recall, update_op = metrics.recall( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_recall = recall.eval() + for _ in range(10): + self.assertEqual(initial_recall, recall.eval()) + + def testAllCorrect(self): + np_inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(np_inputs) + labels = tf.constant(np_inputs) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(1, recall.eval()) + + def testSomeCorrect(self): + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4)) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, update_op.eval()) + self.assertAlmostEqual(0.5, recall.eval()) + + def testWeighted1d(self): + predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + weights = tf.constant([[2], [5]]) + recall, update_op = metrics.recall( + labels, predictions, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + weighted_tp = 2.0 + 5.0 + weighted_t = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_t + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, recall.eval()) + + def testWeighted2d(self): + predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + weights = tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]) + recall, update_op = metrics.recall( + labels, predictions, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + weighted_tp = 3.0 + 1.0 + weighted_t = (2.0 + 3.0) + (4.0 + 1.0) + expected_precision = weighted_tp / weighted_t + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, recall.eval()) + + def testAllIncorrect(self): + np_inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(np_inputs) + labels = tf.constant(1 - np_inputs) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0, recall.eval()) + + def testZeroTruePositivesAndFalseNegativesGivesZeroRecall(self): + predictions = tf.zeros((1, 4)) + labels = tf.zeros((1, 4)) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0, recall.eval()) + + +class StreamingAUCTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.auc( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'auc/true_positives:0', + 'auc/false_negatives:0', + 'auc/false_positives:0', + 'auc/true_negatives:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.auc( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.auc( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + 
self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + auc, update_op = metrics.auc( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_auc = auc.eval() + for _ in range(10): + self.assertAlmostEqual(initial_auc, auc.eval(), 5) + + def testAllCorrect(self): + self.allCorrectAsExpected('ROC') + + def allCorrectAsExpected(self, curve): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + auc, update_op = metrics.auc(labels, predictions, curve=curve) + + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + + self.assertEqual(1, auc.eval()) + + def testSomeCorrect(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + auc, update_op = metrics.auc(labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, sess.run(update_op)) + + self.assertAlmostEqual(0.5, auc.eval()) + + def testWeighted1d(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + weights = tf.constant([2], shape=(1, 1)) + auc, update_op = metrics.auc(labels, + predictions, weights=weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, sess.run(update_op), 5) + + self.assertAlmostEqual(0.5, auc.eval(), 5) + + def testWeighted2d(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + weights = tf.constant([1, 2, 3, 4], shape=(1, 4)) + auc, update_op = metrics.auc(labels, + predictions, weights=weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.7, sess.run(update_op), 5) + + self.assertAlmostEqual(0.7, auc.eval(), 5) + + def testAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.1, 0.4, 0.35, 0.8], + shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 0, 1, 1], shape=(1, 4)) + auc, update_op = metrics.auc(labels, predictions, curve='PR') + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3) + + self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3) + + def testAnotherAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81], + shape=(1, 7), dtype=tf.float32) + labels = tf.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7)) + auc, update_op = metrics.auc(labels, predictions, curve='PR') + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3) + + self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3) + + def testThirdAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5], + shape=(1, 7), dtype=tf.float32) + labels = tf.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7)) + auc, 
update_op = metrics.auc(labels, predictions, curve='PR')
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+
+      self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    with self.test_session() as sess:
+      predictions = tf.constant(inputs, dtype=tf.float32)
+      labels = tf.constant(1 - inputs, dtype=tf.float32)
+      auc, update_op = metrics.auc(labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(0, sess.run(update_op))
+
+      self.assertAlmostEqual(0, auc.eval())
+
+  def testZeroTruePositivesAndFalseNegativesGivesOneAUC(self):
+    with self.test_session() as sess:
+      predictions = tf.zeros([4], dtype=tf.float32)
+      labels = tf.zeros([4])
+      auc, update_op = metrics.auc(labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
+
+      self.assertAlmostEqual(1, auc.eval(), 6)
+
+  def testRecallOneAndPrecisionOneGivesOnePRAUC(self):
+    with self.test_session() as sess:
+      predictions = tf.ones([4], dtype=tf.float32)
+      labels = tf.ones([4])
+      auc, update_op = metrics.auc(labels,
+                                   predictions,
+                                   curve='PR')
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
+
+      self.assertAlmostEqual(1, auc.eval(), 6)
+
+  def np_auc(self, predictions, labels, weights):
+    """Computes the AUC explicitly using NumPy.
+
+    Args:
+      predictions: an ndarray with shape [N].
+      labels: an ndarray with shape [N].
+      weights: an ndarray with shape [N].
+
+    Returns:
+      The area under the ROC curve.
+    """
+    if weights is None:
+      weights = np.ones(np.size(predictions))
+    is_positive = labels > 0
+    num_positives = np.sum(weights[is_positive])
+    num_negatives = np.sum(weights[~is_positive])
+
+    # Sort descending:
+    inds = np.argsort(-predictions)
+
+    sorted_labels = labels[inds]
+    sorted_weights = weights[inds]
+    is_positive = sorted_labels > 0
+
+    tp = np.cumsum(sorted_weights * is_positive) / num_positives
+    return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
+
+  def testWithMultipleUpdates(self):
+    num_samples = 1000
+    batch_size = 10
+    num_batches = int(num_samples / batch_size)
+
+    # Create the labels and data.
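+    # Labels are random 0/1 values and predictions are 0.4 + 0.2 * label plus
+    # Gaussian noise, clipped to [0, 1], so positives tend to score higher.
+    # np_auc on the same arrays gives the reference value that the streaming
+    # AUC (500 thresholds) should approximate to about two decimal places.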
+ labels = np.random.randint(0, 2, size=num_samples) + noise = np.random.normal(0.0, scale=0.2, size=num_samples) + predictions = 0.4 + 0.2 * labels + noise + predictions[predictions > 1] = 1 + predictions[predictions < 0] = 0 + + def _enqueue_as_batches(x, enqueue_ops): + x_batches = x.astype(np.float32).reshape((num_batches, batch_size)) + x_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + for i in range(num_batches): + enqueue_ops[i].append(x_queue.enqueue(x_batches[i, :])) + return x_queue.dequeue() + + for weights in (None, + np.ones(num_samples), + np.random.exponential(scale=1.0, size=num_samples)): + expected_auc = self.np_auc(predictions, labels, weights) + + with self.test_session() as sess: + enqueue_ops = [[] for i in range(num_batches)] + tf_predictions = _enqueue_as_batches(predictions, enqueue_ops) + tf_labels = _enqueue_as_batches(labels, enqueue_ops) + tf_weights = (_enqueue_as_batches(weights, enqueue_ops) + if weights is not None else None) + + for i in range(num_batches): + sess.run(enqueue_ops[i]) + + auc, update_op = metrics.auc( + tf_labels, tf_predictions, curve='ROC', num_thresholds=500, + weights=tf_weights) + + sess.run(tf.local_variables_initializer()) + for i in range(num_batches): + sess.run(update_op) + + # Since this is only approximate, we can't expect a 6 digits match. + # Although with higher number of samples/thresholds we should see the + # accuracy improving + self.assertAlmostEqual(expected_auc, auc.eval(), 2) + + +class SpecificityAtSensitivityTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), sensitivity=0.7) + _assert_local_variables(self, ( + 'specificity_at_sensitivity/true_positives:0', + 'specificity_at_sensitivity/false_negatives:0', + 'specificity_at_sensitivity/false_positives:0', + 'specificity_at_sensitivity/true_negatives:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + sensitivity=0.7, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + sensitivity=0.7, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=2, dtype=tf.int64, seed=1) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_specificity = specificity.eval() + for _ in range(10): + self.assertAlmostEqual(initial_specificity, specificity.eval(), 5) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + self.assertEqual(1, specificity.eval()) + + def testSomeCorrectHighSensitivity(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.45, 0.5, 0.8, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.8) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1.0, sess.run(update_op)) + self.assertAlmostEqual(1.0, specificity.eval()) + + def testSomeCorrectLowSensitivity(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted1d(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [3] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, weights=weights, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted2d(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, weights=weights, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(8.0 / 15.0, sess.run(update_op)) + self.assertAlmostEqual(8.0 / 15.0, specificity.eval()) + + +class StreamingSensitivityAtSpecificityTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), specificity=0.7) + _assert_local_variables(self, ( + 'sensitivity_at_specificity/true_positives:0', + 'sensitivity_at_specificity/false_negatives:0', + 'sensitivity_at_specificity/false_positives:0', + 'sensitivity_at_specificity/true_negatives:0' + )) + + def testMetricsCollection(self): 
+ my_collection_name = '__metrics__' + mean, _ = metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + specificity=0.7, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + specificity=0.7, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=2, dtype=tf.int64, seed=1) + sensitivity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_sensitivity = sensitivity.eval() + for _ in range(10): + self.assertAlmostEqual(initial_sensitivity, sensitivity.eval(), 5) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + self.assertEqual(1, specificity.eval()) + + def testSomeCorrectHighSpecificity(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.1, 0.45, 0.5, 0.8, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.8) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.8, sess.run(update_op)) + self.assertAlmostEqual(0.8, specificity.eval()) + + def testSomeCorrectLowSpecificity(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.01, 0.02, 0.25, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.01, 0.02, 0.25, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, weights=weights, specificity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.675, sess.run(update_op)) + self.assertAlmostEqual(0.675, specificity.eval()) + + +# TODO(nsilberman): Break this up into two sets of tests. 
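+# The tests below exercise precision/recall at a list of thresholds.  As in
+# the NumPy reference computation in testWithMultipleUpdates, a prediction is
+# counted as positive at a threshold t when it is greater than t, and
+#   precision(t) = tp / (tp + fp),  recall(t) = tp / (tp + fn),
+# with a small epsilon guarding against division by zero.  For example, with
+# predictions (1, 0, 1, 0), labels (0, 1, 1, 0) and threshold 0.5, tp = 1,
+# fp = 1 and fn = 1, so both precision and recall are 0.5, which is what
+# testSomeCorrect asserts.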
+class StreamingPrecisionRecallThresholdsTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0]) + _assert_local_variables(self, ( + 'precision_at_thresholds/true_positives:0', + 'precision_at_thresholds/false_positives:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + prec, _ = metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + metrics_collections=[my_collection_name]) + rec, _ = metrics.recall_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [prec, rec]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, precision_op = metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + updates_collections=[my_collection_name]) + _, recall_op = metrics.recall_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), + [precision_op, recall_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + thresholds = [0, 0.5, 1.0] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates, then verify idempotency. + sess.run([prec_op, rec_op]) + initial_prec = prec.eval() + initial_rec = rec.eval() + for _ in range(10): + sess.run([prec_op, rec_op]) + self.assertAllClose(initial_prec, prec.eval()) + self.assertAllClose(initial_rec, rec.eval()) + + # TODO(nsilberman): fix tests (passing but incorrect). 
+ def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertEqual(1, prec.eval()) + self.assertEqual(1, rec.eval()) + + def testSomeCorrect(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0.5, prec.eval()) + self.assertAlmostEqual(0.5, rec.eval()) + + def testAllIncorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(1 - inputs, dtype=tf.float32) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0, prec.eval()) + self.assertAlmostEqual(0, rec.eval()) + + def testWeights1d(self): + with self.test_session() as sess: + predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2), + dtype=tf.float32) + labels = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + weights = tf.constant([[0], [1]], shape=(2, 1), dtype=tf.float32) + thresholds = [0.5, 1.1] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds, weights=weights) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds, weights=weights) + + [prec_low, prec_high] = tf.split(0, 2, prec) + prec_low = tf.reshape(prec_low, shape=()) + prec_high = tf.reshape(prec_high, shape=()) + [rec_low, rec_high] = tf.split(0, 2, rec) + rec_low = tf.reshape(rec_low, shape=()) + rec_high = tf.reshape(rec_high, shape=()) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(1.0, prec_low.eval(), places=5) + self.assertAlmostEqual(0.0, prec_high.eval(), places=5) + self.assertAlmostEqual(1.0, rec_low.eval(), places=5) + self.assertAlmostEqual(0.0, rec_high.eval(), places=5) + + def testWeights2d(self): + with self.test_session() as sess: + predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2), + dtype=tf.float32) + labels = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + weights = tf.constant([[0, 0], [1, 1]], shape=(2, 2), dtype=tf.float32) + thresholds = [0.5, 1.1] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds, weights=weights) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds, weights=weights) + + [prec_low, prec_high] = tf.split(0, 2, prec) + prec_low = tf.reshape(prec_low, shape=()) + prec_high = tf.reshape(prec_high, shape=()) + [rec_low, rec_high] = tf.split(0, 2, rec) + rec_low = tf.reshape(rec_low, shape=()) + rec_high = tf.reshape(rec_high, shape=()) + + sess.run(tf.local_variables_initializer()) + 
sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(1.0, prec_low.eval(), places=5) + self.assertAlmostEqual(0.0, prec_high.eval(), places=5) + self.assertAlmostEqual(1.0, rec_low.eval(), places=5) + self.assertAlmostEqual(0.0, rec_high.eval(), places=5) + + def testExtremeThresholds(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 1], shape=(1, 4)) + thresholds = [-1.0, 2.0] # lower/higher than any values + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + [prec_low, prec_high] = tf.split(0, 2, prec) + [rec_low, rec_high] = tf.split(0, 2, rec) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0.75, prec_low.eval()) + self.assertAlmostEqual(0.0, prec_high.eval()) + self.assertAlmostEqual(1.0, rec_low.eval()) + self.assertAlmostEqual(0.0, rec_high.eval()) + + def testZeroLabelsPredictions(self): + with self.test_session() as sess: + predictions = tf.zeros([4], dtype=tf.float32) + labels = tf.zeros([4]) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0, prec.eval(), 6) + self.assertAlmostEqual(0, rec.eval(), 6) + + def testWithMultipleUpdates(self): + num_samples = 1000 + batch_size = 10 + num_batches = int(num_samples / batch_size) + + # Create the labels and data. + labels = np.random.randint(0, 2, size=(num_samples, 1)) + noise = np.random.normal(0.0, scale=0.2, size=(num_samples, 1)) + predictions = 0.4 + 0.2 * labels + noise + predictions[predictions > 1] = 1 + predictions[predictions < 0] = 0 + thresholds = [0.3] + + tp = 0 + fp = 0 + fn = 0 + tn = 0 + for i in range(num_samples): + if predictions[i] > thresholds[0]: + if labels[i] == 1: + tp += 1 + else: + fp += 1 + else: + if labels[i] == 1: + fn += 1 + else: + tn += 1 + epsilon = 1e-7 + expected_prec = tp / (epsilon + tp + fp) + expected_rec = tp / (epsilon + tp + fn) + + labels = labels.astype(np.float32) + predictions = predictions.astype(np.float32) + + with self.test_session() as sess: + # Reshape the data so its easy to queue up: + predictions_batches = predictions.reshape((batch_size, num_batches)) + labels_batches = labels.reshape((batch_size, num_batches)) + + # Enqueue the data: + predictions_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + labels_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + + for i in range(int(num_batches)): + tf_prediction = tf.constant(predictions_batches[:, i]) + tf_label = tf.constant(labels_batches[:, i]) + sess.run([predictions_queue.enqueue(tf_prediction), + labels_queue.enqueue(tf_label)]) + + tf_predictions = predictions_queue.dequeue() + tf_labels = labels_queue.dequeue() + + prec, prec_op = metrics.precision_at_thresholds( + tf_labels, tf_predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + tf_labels, tf_predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + for _ in range(int(num_samples / batch_size)): + sess.run([prec_op, rec_op]) + # Since this is only approximate, we can't expect a 6 digits match. 
+ # Although with higher number of samples/thresholds we should see the + # accuracy improving + self.assertAlmostEqual(expected_prec, prec.eval(), 2) + self.assertAlmostEqual(expected_rec, rec.eval(), 2) + + +class StreamingSparsePrecisionTest(tf.test.TestCase): + + def _test_streaming_sparse_precision_at_k(self, + predictions, + labels, + k, + expected, + class_id=None, + weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + metric, update = metrics.sparse_precision_at_k( + predictions=tf.constant(predictions, tf.float32), labels=labels, + k=k, class_id=class_id, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + tf.initialize_variables(tf.local_variables()).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertEqual(expected, update.eval()) + self.assertEqual(expected, metric.eval()) + + def _test_streaming_sparse_average_precision_at_k( + self, predictions, labels, k, expected, weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + predictions = tf.constant(predictions, tf.float32) + metric, update = metrics.sparse_average_precision_at_k( + labels, predictions, k, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + local_variables = tf.local_variables() + tf.initialize_variables(local_variables).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertAlmostEqual(expected, update.eval()) + self.assertAlmostEqual(expected, metric.eval()) + + def test_average_precision(self): + # Example 1. + # Matches example here: + # fastml.com/what-you-wanted-to-know-about-mean-average-precision + labels_ex1 = (0, 1, 2, 3, 4) + labels = np.array([labels_ex1], dtype=np.int64) + predictions_ex1 = (0.2, 0.1, 0.0, 0.4, 0.0, 0.5, 0.3) + predictions = (predictions_ex1,) + precision_ex1 = ( + 0.0 / 1, + 1.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex1 = ( + 0.0 / 1, + precision_ex1[1] / 2, + precision_ex1[1] / 3, + (precision_ex1[1] + precision_ex1[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex1[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex1[i]) + + # Example 2. + labels_ex2 = (0, 2, 4, 5, 6) + labels = np.array([labels_ex2], dtype=np.int64) + predictions_ex2 = (0.3, 0.5, 0.0, 0.4, 0.0, 0.1, 0.2) + predictions = (predictions_ex2,) + precision_ex2 = ( + 0.0 / 1, + 0.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex2 = ( + 0.0 / 1, + 0.0 / 2, + precision_ex2[2] / 3, + (precision_ex2[2] + precision_ex2[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex2[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex2[i]) + + # Both examples, we expect both precision and average precision to be the + # average of the 2 examples. 
+ labels = np.array([labels_ex1, labels_ex2], dtype=np.int64) + predictions = (predictions_ex1, predictions_ex2) + streaming_precision = [ + (ex1 + ex2) / 2 + for ex1, ex2 in zip(precision_ex1, precision_ex2)] + streaming_average_precision = [ + (ex1 + ex2) / 2 + for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)] + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=streaming_precision[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=streaming_average_precision[i]) + + # Weighted examples, we expect streaming average precision to be the + # weighted average of the 2 examples. + weights = (0.3, 0.6) + streaming_average_precision = [ + (weights[0] * ex1 + weights[1] * ex2) / (weights[0] + weights[1]) + for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)] + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=streaming_average_precision[i], + weights=weights) + + def test_average_precision_some_labels_out_of_range(self): + """Tests that labels outside the [0, n_classes) range are ignored.""" + labels_ex1 = (-1, 0, 1, 2, 3, 4, 7) + labels = np.array([labels_ex1], dtype=np.int64) + predictions_ex1 = (0.2, 0.1, 0.0, 0.4, 0.0, 0.5, 0.3) + predictions = (predictions_ex1,) + precision_ex1 = ( + 0.0 / 1, + 1.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex1 = ( + 0.0 / 1, + precision_ex1[1] / 2, + precision_ex1[1] / 3, + (precision_ex1[1] + precision_ex1[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex1[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex1[i]) + + def test_one_label_at_k1_nan(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,1,2 have 0 predictions, classes -1 and 4 are out of range. + for class_id in (-1, 0, 1, 2, 4): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=NAN, class_id=class_id) + + def test_one_label_at_k1(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=1.0 / 2, class_id=3) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=1.0 / 2) + + def test_three_labels_at_k5_no_predictions(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range. 
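+      # With no top-5 predictions for a class, the class-specific precision
+      # has a zero denominator, so the metric is expected to be NaN.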
+ for class_id in (-1, 1, 3, 8, 10): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_three_labels_at_k5_no_labels(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,4,6,9: 0 labels, >=1 prediction. + for class_id in (0, 4, 6, 9): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_three_labels_at_k5(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 2: 2 labels, 2 correct predictions. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2, + class_id=2) + + # Class 5: 1 label, 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 1, class_id=5) + + # Class 7: 1 label, 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0 / 1, class_id=7) + + # All classes: 10 predictions, 3 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=3.0 / 10) + + def test_three_labels_at_k5_some_out_of_range(self): + """Tests that labels outside the [0, n_classes) range are ignored.""" + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sp_labels = tf.SparseTensorValue( + indices=[[0, 0], [0, 1], [0, 2], [0, 3], + [1, 0], [1, 1], [1, 2], [1, 3]], + # values -1 and 10 are outside the [0, n_classes) range and are ignored. + values=np.array([2, 7, -1, 8, + 1, 2, 5, 10], np.int64), + shape=[2, 4]) + + # Class 2: 2 labels, 2 correct predictions. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=2.0 / 2, class_id=2) + + # Class 5: 1 label, 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=1.0 / 1, class_id=5) + + # Class 7: 1 label, 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=0.0 / 1, class_id=7) + + # All classes: 10 predictions, 3 correct. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=3.0 / 10) + + def test_3d_nan(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range. 
+ for class_id in (-1, 1, 3, 8, 10): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_3d_no_labels(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Classes 0,4,6,9: 0 labels, >=1 prediction. + for class_id in (0, 4, 6, 9): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_3d(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 4 predictions, all correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=4.0 / 4, class_id=2) + + # Class 5: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2, class_id=5) + + # Class 7: 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 2, class_id=7) + + # All classes: 20 predictions, 7 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=7.0 / 20) + + def test_3d_ignore_some(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[1], [0]]) + + # Class 2: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[0], [1]]) + + # Class 7: 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, + weights=[[1], [0]]) + + # Class 7: 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, + weights=[[0], [1]]) + + # Class 7: no predictions. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=7, + weights=[[1, 0], [0, 1]]) + + # Class 7: 2 predictions, 1 correct. 
+ self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 2.0, class_id=7, + weights=[[0, 1], [1, 0]]) + + def test_sparse_tensor_value(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + labels = [[0, 0, 0, 1], [0, 0, 1, 0]] + expected_precision = 0.5 + with self.test_session(): + _, precision = metrics.sparse_precision_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=_binary_2d_label_to_sparse_value(labels), k=1) + + tf.initialize_variables(tf.local_variables()).run() + + self.assertEqual(expected_precision, precision.eval()) + + +class RecallAtkTest(tf.test.TestCase): + + def _test_streaming_sparse_recall_at_k(self, + predictions, + labels, + k, + expected, + class_id=None, + weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + metric, update = metrics.recall_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=labels, k=k, class_id=class_id, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + tf.initialize_variables(tf.local_variables()).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertEqual(expected, update.eval()) + self.assertEqual(expected, metric.eval()) + + def test_one_label_at_k1_nan(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of + # range. + for labels in (sparse_labels, dense_labels): + for class_id in (-1, 0, 1, 4): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, + class_id=class_id) + + def test_one_label_at_k1_no_predictions(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 2: 0 predictions. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.0, + class_id=2) + + def test_one_label_at_k1(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, + class_id=3) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2) + + def test_one_label_at_k1_weighted(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. 
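+      # Only the first example is labeled with class 3, and its top-1
+      # prediction is class 3, so the weighted recall below behaves like
+      # sum(w * tp) / sum(w * (tp + fn)): w[0] / w[0] whenever w[0] > 0, and
+      # NaN when the class-3 example gets zero weight.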
+ self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, weights=(0.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(2.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, + weights=(0.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, + weights=(0.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=2.0 / 2, class_id=3, + weights=(2.0, 3.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=3.0 / 3, class_id=3, + weights=(3.0, 2.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.3 / 0.3, class_id=3, + weights=(0.3, 0.6)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.6 / 0.6, class_id=3, + weights=(0.6, 0.3)) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, weights=(0.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(1.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(2.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(1.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=3.0 / 5, weights=(3.0, 2.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.3 / 0.9, weights=(0.3, 0.6)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.6 / 0.9, weights=(0.6, 0.3)) + + def test_three_labels_at_k5_nan(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range. + for class_id in (0, 3, 4, 6, 9, 10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_three_labels_at_k5_no_predictions(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 8: 1 label, no predictions. 
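+      # (Class 8 is labeled in the first example but never reaches its top 5,
+      # whose indices are {9, 4, 6, 2, 0}, so recall is 0 / 1.)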
+        self._test_streaming_sparse_recall_at_k(
+            predictions, labels, k=5, expected=0.0 / 1, class_id=8)
+
+  def test_three_labels_at_k5(self):
+    predictions = [
+        [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
+        [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    sparse_labels = _binary_2d_label_to_sparse_value([
+        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
+        [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
+    dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
+
+    for labels in (sparse_labels, dense_labels):
+      # Class 2: 2 labels, both correct.
+      self._test_streaming_sparse_recall_at_k(
+          predictions, labels, k=5, expected=2.0 / 2, class_id=2)
+
+      # Class 5: 1 label, correct.
+      self._test_streaming_sparse_recall_at_k(
+          predictions, labels, k=5, expected=1.0 / 1, class_id=5)
+
+      # Class 7: 1 label, incorrect.
+      self._test_streaming_sparse_recall_at_k(
+          predictions, labels, k=5, expected=0.0 / 1, class_id=7)
+
+      # All classes: 6 labels, 3 correct.
+      self._test_streaming_sparse_recall_at_k(
+          predictions, labels, k=5, expected=3.0 / 6)
+
+  def test_three_labels_at_k5_some_out_of_range(self):
+    """Tests that labels outside the [0, n_classes) range count in the denominator."""
+    predictions = [
+        [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
+        [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    sp_labels = tf.SparseTensorValue(
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3],
+                 [1, 0], [1, 1], [1, 2], [1, 3]],
+        # values -1 and 10 are outside the [0, n_classes) range.
+        values=np.array([2, 7, -1, 8,
+                         1, 2, 5, 10], np.int64),
+        shape=[2, 4])
+
+    # Class 2: 2 labels, both correct.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=2.0 / 2,
+        class_id=2)
+
+    # Class 5: 1 label, correct.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=1.0 / 1,
+        class_id=5)
+
+    # Class 7: 1 label, incorrect.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=0.0 / 1,
+        class_id=7)
+
+    # All classes: 8 labels, 3 correct.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=3.0 / 8)
+
+  def test_3d_nan(self):
+    predictions = [[
+        [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
+        [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]
+    ], [
+        [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
+        [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]
+    ]]
+    sparse_labels = _binary_3d_label_to_sparse_value([[
+        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
+        [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
+    ], [
+        [0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
+        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]
+    ]])
+    dense_labels = np.array([[
+        [2, 7, 8],
+        [1, 2, 5]
+    ], [
+        [1, 2, 5],
+        [2, 7, 8],
+    ]], dtype=np.int64)
+
+    for labels in (sparse_labels, dense_labels):
+      # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
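+      # (With no labels for a class, the recall denominator is zero, so the
+      # metric is expected to be NaN rather than 0.)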
+ for class_id in (0, 3, 4, 6, 9, 10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_3d_no_predictions(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + sparse_labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0] + ]]) + dense_labels = np.array([[ + [2, 7, 8], + [1, 2, 5] + ], [ + [1, 2, 5], + [2, 7, 8], + ]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 1,8 have 0 predictions, >=1 label. + for class_id in (1, 8): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_3d(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 4 labels, all correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=4.0 / 4, class_id=2) + + # Class 5: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2, class_id=5) + + # Class 7: 2 labels, 1 incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 2, class_id=7) + + # All classes: 12 labels, 7 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=7.0 / 12) + + def test_3d_ignore_all(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + for class_id in xrange(10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id, + weights=[[0], [0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id, + weights=[[0, 0], [0, 0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, weights=[[0], [0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]]) + + def test_3d_ignore_some(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 2 labels, both correct. 
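+    # (Weights of shape [2, 1] apply per example, so [[1], [0]] keeps only
+    # the first example; both of its class-2 labels are in the top 5.)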
+ self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[1], [0]]) + + # Class 2: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[0], [1]]) + + # Class 7: 1 label, correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, + weights=[[0], [1]]) + + # Class 7: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, + weights=[[1], [0]]) + + # Class 7: 2 labels, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 2.0, class_id=7, + weights=[[1, 0], [1, 0]]) + + # Class 7: No labels. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=7, + weights=[[0, 1], [0, 1]]) + + def test_sparse_tensor_value(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + labels = [[0, 0, 1, 0], [0, 0, 0, 1]] + expected_recall = 0.5 + with self.test_session(): + _, recall = metrics.recall_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=_binary_2d_label_to_sparse_value(labels), k=1) + + tf.initialize_variables(tf.local_variables()).run() + + self.assertEqual(expected_recall, recall.eval()) + + +class MeanAbsoluteErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_absolute_error/count:0', + 'mean_absolute_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_absolute_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
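+      # (Evaluating the value tensor must not change the underlying total and
+      # count variables; only update_op mutates them, so repeated eval()
+      # calls keep returning the same number.)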
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateWithErrorAndWeights(self): + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + error, update_op = metrics.mean_absolute_error( + labels, predictions, weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(3, sess.run(update_op)) + self.assertEqual(3, error.eval()) + + +class MeanRelativeErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_relative_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_relative_error/count:0', + 'mean_relative_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_relative_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual( + tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_relative_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + normalizer = tf.random_normal((10, 3), seed=3) + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateNormalizedByLabels(self): + np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32) + np_labels = np.asarray([1, 3, 2, 3], dtype=np.float32) + expected_error = np.mean( + np.divide(np.absolute(np_predictions - np_labels), + np_labels)) + + predictions = tf.constant(np_predictions, shape=(1, 4), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(1, 4)) + + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer=labels) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(expected_error, sess.run(update_op)) + self.assertEqual(expected_error, error.eval()) + + def testSingleUpdateNormalizedByZeros(self): + np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32) + + predictions = tf.constant(np_predictions, shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer=tf.zeros_like(labels)) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0.0, sess.run(update_op)) + self.assertEqual(0.0, error.eval()) + + +class MeanSquaredErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_squared_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_squared_error/count:0', + 'mean_squared_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + predictions = tf.zeros((1, 3), dtype=tf.float32) + labels = tf.zeros((1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithError(self): + predictions = tf.constant([2, 4, 6], shape=(1, 3), dtype=tf.float32) + labels = tf.constant([1, 3, 2], shape=(1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(6, sess.run(update_op)) + self.assertEqual(6, error.eval()) + + def testSingleUpdateWithErrorAndWeights(self): + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + error, update_op = metrics.mean_squared_error( + labels, predictions, weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(13, sess.run(update_op)) + self.assertEqual(13, error.eval()) + + def testMultipleBatchesOfSizeOne(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue, [10, 8, 6]) + _enqueue_vector(sess, preds_queue, [-4, 3, -1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue, [1, 3, 2]) + _enqueue_vector(sess, labels_queue, [2, 4, 6]) + labels = labels_queue.dequeue() + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5) + + self.assertAlmostEqual(208.0 / 6, error.eval(), 5) + + def testMetricsComputedConcurrently(self): + with self.test_session() as sess: + # Create the queue that populates one set of predictions. + preds_queue0 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue0, [10, 8, 6]) + _enqueue_vector(sess, preds_queue0, [-4, 3, -1]) + predictions0 = preds_queue0.dequeue() + + # Create the queue that populates one set of predictions. + preds_queue1 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue1, [0, 1, 1]) + _enqueue_vector(sess, preds_queue1, [1, 1, 0]) + predictions1 = preds_queue1.dequeue() + + # Create the queue that populates one set of labels. + labels_queue0 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue0, [1, 3, 2]) + _enqueue_vector(sess, labels_queue0, [2, 4, 6]) + labels0 = labels_queue0.dequeue() + + # Create the queue that populates another set of labels. 
+ labels_queue1 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue1, [-5, -3, -1]) + _enqueue_vector(sess, labels_queue1, [5, 4, 3]) + labels1 = labels_queue1.dequeue() + + mse0, update_op0 = metrics.mean_squared_error( + labels0, predictions0, name='msd0') + mse1, update_op1 = metrics.mean_squared_error( + labels1, predictions1, name='msd1') + + sess.run(tf.local_variables_initializer()) + sess.run([update_op0, update_op1]) + sess.run([update_op0, update_op1]) + + mse0, mse1 = sess.run([mse0, mse1]) + self.assertAlmostEqual(208.0 / 6, mse0, 5) + self.assertAlmostEqual(79.0 / 6, mse1, 5) + + def testMultipleMetricsOnMultipleBatchesOfSizeOne(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue, [10, 8, 6]) + _enqueue_vector(sess, preds_queue, [-4, 3, -1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue, [1, 3, 2]) + _enqueue_vector(sess, labels_queue, [2, 4, 6]) + labels = labels_queue.dequeue() + + mae, ma_update_op = metrics.mean_absolute_error( + labels, predictions) + mse, ms_update_op = metrics.mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + sess.run([ma_update_op, ms_update_op]) + sess.run([ma_update_op, ms_update_op]) + + self.assertAlmostEqual(32.0 / 6, mae.eval(), 5) + self.assertAlmostEqual(208.0 / 6, mse.eval(), 5) + + +class RootMeanSquaredErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'root_mean_squared_error/count:0', + 'root_mean_squared_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.root_mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + with self.test_session() as sess: + predictions = tf.constant(0.0, shape=(1, 3), dtype=tf.float32) + labels = tf.constant(0.0, shape=(1, 3), dtype=tf.float32) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + + self.assertEqual(0, rmse.eval()) + + def testSingleUpdateWithError(self): + with self.test_session() as sess: + predictions = tf.constant([2, 4, 6], shape=(1, 3), dtype=tf.float32) + labels = tf.constant([1, 3, 2], shape=(1, 3), dtype=tf.float32) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5) + self.assertAlmostEqual(math.sqrt(6), rmse.eval(), 5) + + def testSingleUpdateWithErrorAndWeights(self): + with self.test_session() as sess: + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions, weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(math.sqrt(13), sess.run(update_op)) + + self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5) + + +def _reweight(predictions, labels, weights): + return (np.concatenate([[p] * int(w) for p, w in zip(predictions, weights)]), + np.concatenate([[l] * int(w) for l, w in zip(labels, weights)])) + + +class MeanCosineDistanceTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), labels=tf.ones((10, 3)), dim=1) + _assert_local_variables(self, ( + 'mean_cosine_distance/count:0', + 'mean_cosine_distance/total:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), + labels=tf.ones((10, 3)), + dim=1, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), + labels=tf.ones((10, 3)), + dim=1, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=1) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_labels, shape=(1, 3, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(1, 3, 3), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithError1(self): + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1, sess.run(update_op), 5) + self.assertAlmostEqual(1, error.eval(), 5) + + def testSingleUpdateWithError2(self): + np_predictions = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '-0.665139432070255 -0.739487441769973 -0.103671883216994;' + '0.707106781186548 -0.707106781186548 0')) + np_labels = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '0.665139432070255 0.739487441769973 0.103671883216994;' + '0.707106781186548 0.707106781186548 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1.0, sess.run(update_op), 5) + self.assertAlmostEqual(1.0, error.eval(), 5) + + def testSingleUpdateWithErrorAndWeights1(self): + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + weights = tf.constant([1, 0, 0], shape=(3, 1, 1), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithErrorAndWeights2(self): + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + weights = tf.constant([0, 1, 1], shape=(3, 1, 1), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1.5, update_op.eval()) + self.assertEqual(1.5, error.eval()) + + +class PcntBelowThreshTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.percentage_below(values=tf.ones((10,)), threshold=2) + _assert_local_variables(self, ( + 
'percentage_below_threshold/count:0', + 'percentage_below_threshold/total:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.percentage_below( + values=tf.ones((10,)), + threshold=2, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.percentage_below( + values=tf.ones((10,)), + threshold=2, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testOneUpdate(self): + with self.test_session() as sess: + values = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + + pcnt0, update_op0 = metrics.percentage_below( + values, 100, name='high') + pcnt1, update_op1 = metrics.percentage_below( + values, 7, name='medium') + pcnt2, update_op2 = metrics.percentage_below( + values, 1, name='low') + + sess.run(tf.local_variables_initializer()) + sess.run([update_op0, update_op1, update_op2]) + + pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2]) + self.assertAlmostEqual(1.0, pcnt0, 5) + self.assertAlmostEqual(0.75, pcnt1, 5) + self.assertAlmostEqual(0.0, pcnt2, 5) + + def testSomePresentOneUpdate(self): + with self.test_session() as sess: + values = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([1, 0, 0, 1], shape=(1, 4), dtype=tf.float32) + + pcnt0, update_op0 = metrics.percentage_below( + values, 100, weights=weights, name='high') + pcnt1, update_op1 = metrics.percentage_below( + values, 7, weights=weights, name='medium') + pcnt2, update_op2 = metrics.percentage_below( + values, 1, weights=weights, name='low') + + sess.run(tf.local_variables_initializer()) + self.assertListEqual([1.0, 0.5, 0.0], + sess.run([update_op0, update_op1, update_op2])) + + pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2]) + self.assertAlmostEqual(1.0, pcnt0, 5) + self.assertAlmostEqual(0.5, pcnt1, 5) + self.assertAlmostEqual(0.0, pcnt2, 5) + + +class MeanIOUTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.mean_iou( + predictions=tf.ones([10, 1]), labels=tf.ones([10, 1]), num_classes=2) + _assert_local_variables(self, ('mean_iou/total_confusion_matrix:0',)) + + def testMetricsCollections(self): + my_collection_name = '__metrics__' + mean_iou, _ = metrics.mean_iou( + predictions=tf.ones([10, 1]), + labels=tf.ones([10, 1]), + num_classes=2, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean_iou]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_iou( + predictions=tf.ones([10, 1]), + labels=tf.ones([10, 1]), + num_classes=2, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones([10, 3]) + labels = tf.ones([10, 4]) + with self.assertRaises(ValueError): + metrics.mean_iou( + labels, predictions, num_classes=2) + + def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones([10]) + labels = tf.ones([10]) + weights = tf.zeros([9]) + with self.assertRaises(ValueError): + metrics.mean_iou( + labels, predictions, num_classes=2, weights=weights) + + def testValueTensorIsIdempotent(self): + num_classes = 3 + predictions = 
tf.random_uniform([10], maxval=num_classes, + dtype=tf.int64, seed=1) + labels = tf.random_uniform([10], maxval=num_classes, + dtype=tf.int64, seed=1) + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes=num_classes) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_miou = miou.eval() + for _ in range(10): + self.assertEqual(initial_miou, miou.eval()) + + def testMultipleUpdates(self): + num_classes = 3 + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [2]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [2]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + + sess.run(tf.local_variables_initializer()) + for _ in range(5): + sess.run(update_op) + desired_output = np.mean([1.0/2.0, 1.0/4.0, 0.]) + self.assertEqual(desired_output, miou.eval()) + + def testMultipleUpdatesWithWeights(self): + num_classes = 2 + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(6, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(6, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(6, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.0]) + weights = weights_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes, weights=weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(6): + sess.run(update_op) + desired_output = np.mean([2.0/3.0, 1.0/2.0]) + self.assertAlmostEqual(desired_output, miou.eval()) + + def testMultipleUpdatesWithMissingClass(self): + # Test the case where there are no predicions and labels for + # one class, and thus there is one row and one column with + # zero entries in the confusion matrix. + num_classes = 3 + with self.test_session() as sess: + # Create the queue that populates the predictions. 
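+      # Class 2 never occurs in either queue below, so its row and column of
+      # the confusion matrix stay zero; the expected mean IOU in this test
+      # averages in 0.0 for that class.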
+ # There is no prediction for class 2. + preds_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + # There is label for class 2. + labels_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + + sess.run(tf.local_variables_initializer()) + for _ in range(5): + sess.run(update_op) + desired_output = np.mean([1.0/3.0, 2.0/4.0, 0.]) + self.assertAlmostEqual(desired_output, miou.eval()) + + def testUpdateOpEvalIsAccumulatedConfusionMatrix(self): + predictions = tf.concat(0, + [tf.constant(0, shape=[5]), + tf.constant(1, shape=[5])]) + labels = tf.concat(0, + [tf.constant(0, shape=[3]), + tf.constant(1, shape=[7])]) + num_classes = 2 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + confusion_matrix = update_op.eval() + self.assertAllEqual([[3, 2], [0, 5]], confusion_matrix) + desired_miou = np.mean([3./5., 5./7.]) + self.assertAlmostEqual(desired_miou, miou.eval()) + + def testAllCorrect(self): + predictions = tf.zeros([40]) + labels = tf.zeros([40]) + num_classes = 1 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + self.assertEqual(40, update_op.eval()[0]) + self.assertEqual(1.0, miou.eval()) + + def testAllWrong(self): + predictions = tf.zeros([40]) + labels = tf.ones([40]) + num_classes = 2 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + self.assertAllEqual([[0, 40], [0, 0]], update_op.eval()) + self.assertEqual(0., miou.eval()) + + def testResultsWithSomeMissing(self): + predictions = tf.concat(0, [tf.constant(0, shape=[5]), + tf.constant(1, shape=[5])]) + labels = tf.concat(0, [tf.constant(0, shape=[3]), + tf.constant(1, shape=[7])]) + num_classes = 2 + weights = tf.concat(0, [tf.constant(0, shape=[1]), + tf.constant(1, shape=[8]), + tf.constant(0, shape=[1])]) + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes, weights=weights) + sess.run(tf.local_variables_initializer()) + self.assertAllEqual([[2, 2], [0, 4]], update_op.eval()) + desired_miou = np.mean([2./4., 4./6.]) + self.assertAlmostEqual(desired_miou, miou.eval()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py index cbc5ee278e10b9..56bef403016dbb 100644 --- a/tensorflow/python/kernel_tests/parsing_ops_test.py +++ b/tensorflow/python/kernel_tests/parsing_ops_test.py @@ -47,7 +47,8 @@ def flatten(list_of_lists): def flatten_values_tensors_or_sparse(tensors_list): """Flatten each SparseTensor object into 3 Tensors for session.run().""" return list( - flatten([[v.indices, v.values, v.shape] if isinstance(v, tf.SparseTensor) 
+ flatten([[v.indices, v.values, v.dense_shape] + if isinstance(v, tf.SparseTensor) else [v] for v in tensors_list])) @@ -102,7 +103,8 @@ def _test(self, kwargs, expected_values=None, expected_err=None): self.assertEqual( tuple(out[k].indices.get_shape().as_list()), (None, 2)) self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,)) - self.assertEqual(tuple(out[k].shape.get_shape().as_list()), (2,)) + self.assertEqual( + tuple(out[k].dense_shape.get_shape().as_list()), (2,)) def testEmptySerializedWithAllDefaults(self): sparse_name = "st_a" @@ -169,7 +171,9 @@ def testEmptySerializedWithoutDefaultsShouldFail(self): "serialized": [original.SerializeToString()], "features": input_features, }, - expected_err=(tf.OpError, "Name: in1, Feature: c is required")) + expected_err=( + tf.OpError, + "Name: in1, Feature: c \\(data type: float\\) is required")) # Standard case of missing key and value. self._test( @@ -178,7 +182,9 @@ def testEmptySerializedWithoutDefaultsShouldFail(self): "serialized": ["", ""], "features": input_features, }, - expected_err=(tf.OpError, "Name: in1, Feature: c is required")) + expected_err=( + tf.OpError, + "Name: in1, Feature: c \\(data type: float\\) is required")) def testDenseNotMatchingShapeShouldFail(self): original = [ @@ -597,7 +603,8 @@ def _test(self, kwargs, expected_values=None, expected_err=None): self.assertEqual( tuple(out[k].indices.get_shape().as_list()), (None, 1)) self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,)) - self.assertEqual(tuple(out[k].shape.get_shape().as_list()), (1,)) + self.assertEqual( + tuple(out[k].dense_shape.get_shape().as_list()), (1,)) def testSingleExampleWithSparseAndSparseFeatureAndDense(self): original = example(features=features({"c": float_feature([3, 4]), @@ -712,7 +719,7 @@ def _test(self, self.assertEqual( tuple(context_out[k].values.get_shape().as_list()), (None,)) self.assertEqual( - tuple(context_out[k].shape.get_shape().as_list()), (1,)) + tuple(context_out[k].dense_shape.get_shape().as_list()), (1,)) def testSequenceExampleWithSparseAndDenseContext(self): original = sequence_example(context=features({"c": float_feature([3, 4]), diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py index fbfad8327ed6ba..ca6b198fa8b21c 100644 --- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py +++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py @@ -25,13 +25,17 @@ class ReverseSequenceTest(tf.test.TestCase): - def _testReverseSequence(self, x, batch_dim, seq_dim, seq_lengths, - truth, use_gpu=False, expected_err_re=None): + def _testReverseSequence(self, + x, + batch_axis, + seq_axis, + seq_lengths, + truth, + use_gpu=False, + expected_err_re=None): with self.test_session(use_gpu=use_gpu): - ans = tf.reverse_sequence(x, - batch_dim=batch_dim, - seq_dim=seq_dim, - seq_lengths=seq_lengths) + ans = tf.reverse_sequence( + x, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=seq_lengths) if expected_err_re is None: tf_ans = ans.eval() self.assertAllClose(tf_ans, truth, atol=1e-10) @@ -40,12 +44,17 @@ def _testReverseSequence(self, x, batch_dim, seq_dim, seq_lengths, with self.assertRaisesOpError(expected_err_re): ans.eval() - def _testBothReverseSequence(self, x, batch_dim, seq_dim, seq_lengths, - truth, expected_err_re=None): - self._testReverseSequence(x, batch_dim, seq_dim, seq_lengths, - truth, True, expected_err_re) - self._testReverseSequence(x, batch_dim, seq_dim, seq_lengths, - truth, False, 
expected_err_re) + def _testBothReverseSequence(self, + x, + batch_axis, + seq_axis, + seq_lengths, + truth, + expected_err_re=None): + self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth, True, + expected_err_re) + self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth, + False, expected_err_re) def _testBasic(self, dtype, len_dtype=np.int64): x = np.asarray([ @@ -66,9 +75,9 @@ def _testBasic(self, dtype, len_dtype=np.int64): truth_orig = truth_orig.reshape(3, 2, 4, 1, 1) truth = truth_orig.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 - seq_dim = 0 # permute seq_dim and batch_dim (originally 2 and 0, resp.) - batch_dim = 2 - self._testBothReverseSequence(x, batch_dim, seq_dim, seq_lengths, truth) + seq_axis = 0 # permute seq_axis and batch_axis (originally 2 and 0, resp.) + batch_axis = 2 + self._testBothReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth) def testSeqLengthInt32(self): self._testBasic(np.float32, np.int32) @@ -100,17 +109,18 @@ def testFloatReverseSequenceGrad(self): x = x.transpose([2, 1, 0, 3, 4]) # transpose axes 0 <=> 2 # reverse dim 0 up to (0:3, none, 0:4) along dim=2 - seq_dim = 0 - batch_dim = 2 + seq_axis = 0 + batch_axis = 2 seq_lengths = np.asarray([3, 0, 4], dtype=np.int64) with self.test_session(): input_t = tf.constant(x, shape=x.shape) seq_lengths_t = tf.constant(seq_lengths, shape=seq_lengths.shape) - reverse_sequence_out = tf.reverse_sequence(input_t, - batch_dim=batch_dim, - seq_dim=seq_dim, - seq_lengths=seq_lengths_t) + reverse_sequence_out = tf.reverse_sequence( + input_t, + batch_axis=batch_axis, + seq_axis=seq_axis, + seq_lengths=seq_lengths_t) err = tf.test.compute_gradient_error(input_t, x.shape, reverse_sequence_out, @@ -121,41 +131,49 @@ def testFloatReverseSequenceGrad(self): def testShapeFunctionEdgeCases(self): t = tf.reverse_sequence( - tf.placeholder(tf.float32, shape=None), - seq_lengths=tf.placeholder(tf.int64, shape=(32,)), - batch_dim=0, seq_dim=1) + tf.placeholder( + tf.float32, shape=None), + seq_lengths=tf.placeholder( + tf.int64, shape=(32,)), + batch_axis=0, + seq_axis=1) self.assertIs(t.get_shape().ndims, None) # Batch size mismatched between input and seq_lengths. with self.assertRaises(ValueError): tf.reverse_sequence( - tf.placeholder(tf.float32, shape=(32, 2, 3)), - seq_lengths=tf.placeholder(tf.int64, shape=(33,)), - seq_dim=3) + tf.placeholder( + tf.float32, shape=(32, 2, 3)), + seq_lengths=tf.placeholder( + tf.int64, shape=(33,)), + seq_axis=3) - # seq_dim out of bounds. + # seq_axis out of bounds. with self.assertRaisesRegexp(ValueError, "seq_dim must be < input rank"): tf.reverse_sequence( - tf.placeholder(tf.float32, shape=(32, 2, 3)), - seq_lengths=tf.placeholder(tf.int64, shape=(32,)), - seq_dim=3) + tf.placeholder( + tf.float32, shape=(32, 2, 3)), + seq_lengths=tf.placeholder( + tf.int64, shape=(32,)), + seq_axis=3) - # batch_dim out of bounds. + # batch_axis out of bounds. 
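+    # (The error-message regexps in this test are left as "seq_dim" and
+    # "batch_dim", presumably because the underlying op still reports errors
+    # with the old names even though the Python API now takes seq_axis and
+    # batch_axis.)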
with self.assertRaisesRegexp( ValueError, "batch_dim must be < input rank"): tf.reverse_sequence( - tf.placeholder(tf.float32, shape=(32, 2, 3)), - seq_lengths=tf.placeholder(tf.int64, shape=(32,)), - seq_dim=0, - batch_dim=3) + tf.placeholder( + tf.float32, shape=(32, 2, 3)), + seq_lengths=tf.placeholder( + tf.int64, shape=(32,)), + seq_axis=0, + batch_axis=3) with self.test_session(): inputs = tf.placeholder(tf.float32, shape=(32, 2, 3)) seq_lengths = tf.placeholder(tf.int64, shape=(32,)) output = tf.reverse_sequence( - inputs, - seq_lengths=seq_lengths, - seq_dim=0) # batch_dim default is 0 + inputs, seq_lengths=seq_lengths, + seq_axis=0) # batch_axis default is 0 with self.assertRaisesOpError("batch_dim == seq_dim"): output.eval(feed_dict={inputs: np.random.rand(32, 2, 3), seq_lengths: xrange(32)}) diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index d3897afb9238b0..abb5675ff476f7 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -30,7 +30,7 @@ from tensorflow.python.util import nest -class Plus1RNNCell(tf.nn.rnn_cell.RNNCell): +class Plus1RNNCell(tf.contrib.rnn.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" @property @@ -45,7 +45,7 @@ def __call__(self, input_, state, scope=None): return (input_ + 1, state + 1) -class DummyMultiDimensionalLSTM(tf.nn.rnn_cell.RNNCell): +class DummyMultiDimensionalLSTM(tf.contrib.rnn.RNNCell): """LSTM Cell generating (output, new_state) = (input + 1, state + 1). The input to this cell may have an arbitrary number of dimensions that follow @@ -79,7 +79,7 @@ def __call__(self, input_, state, scope=None): return (input_ + 1, (h + 1, c + 1)) -class NestedRNNCell(tf.nn.rnn_cell.RNNCell): +class NestedRNNCell(tf.contrib.rnn.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1). The input, output and state of this cell is a tuple of two tensors. 
@@ -135,154 +135,10 @@ def setUp(self): def testInvalidSequenceLengthShape(self): cell = Plus1RNNCell() inputs = [tf.placeholder(tf.float32, shape=(3, 4))] - with self.assertRaisesRegexp(ValueError, "must be a vector"): - tf.nn.rnn(cell, inputs, dtype=tf.float32, sequence_length=4) with self.assertRaisesRegexp(ValueError, "must be a vector"): tf.nn.dynamic_rnn( cell, tf.stack(inputs), dtype=tf.float32, sequence_length=[[4]]) - def testRNN(self): - cell = Plus1RNNCell() - batch_size = 2 - input_size = 5 - max_length = 8 # unrolled up to this length - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - outputs, state = tf.nn.rnn(cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), len(inputs)) - for out, inp in zip(outputs, inputs): - self.assertEqual(out.get_shape(), inp.get_shape()) - self.assertEqual(out.dtype, inp.dtype) - - with self.test_session(use_gpu=False) as sess: - input_value = np.random.randn(batch_size, input_size) - values = sess.run(outputs + [state], - feed_dict={inputs[0]: input_value}) - - # Outputs - for v in values[:-1]: - self.assertAllClose(v, input_value + 1.0) - - # Final state - self.assertAllClose( - values[-1], - max_length * np.ones((batch_size, input_size), dtype=np.float32)) - - def testDropout(self): - cell = Plus1RNNCell() - full_dropout_cell = tf.nn.rnn_cell.DropoutWrapper( - cell, input_keep_prob=1e-12, seed=0) - batch_size = 2 - input_size = 5 - max_length = 8 - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - with tf.variable_scope("share_scope"): - outputs, state = tf.nn.rnn(cell, inputs, dtype=tf.float32) - with tf.variable_scope("drop_scope"): - dropped_outputs, _ = tf.nn.rnn( - full_dropout_cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), len(inputs)) - for out, inp in zip(outputs, inputs): - self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list()) - self.assertEqual(out.dtype, inp.dtype) - - with self.test_session(use_gpu=False) as sess: - input_value = np.random.randn(batch_size, input_size) - values = sess.run(outputs + [state], - feed_dict={inputs[0]: input_value}) - full_dropout_values = sess.run(dropped_outputs, - feed_dict={inputs[0]: input_value}) - - for v in values[:-1]: - self.assertAllClose(v, input_value + 1.0) - for d_v in full_dropout_values[:-1]: # Add 1.0 to dropped_out (all zeros) - self.assertAllClose(d_v, np.ones_like(input_value)) - - def _testDynamicCalculation(self, use_gpu): - cell = Plus1RNNCell() - sequence_length = tf.placeholder(tf.int64) - batch_size = 2 - input_size = 5 - max_length = 8 - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - with tf.variable_scope("drop_scope"): - dynamic_outputs, dynamic_state = tf.nn.rnn( - cell, inputs, sequence_length=sequence_length, dtype=tf.float32) - self.assertEqual(len(dynamic_outputs), len(inputs)) - - with self.test_session(use_gpu=use_gpu) as sess: - input_value = np.random.randn(batch_size, input_size) - dynamic_values = sess.run(dynamic_outputs, - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) - dynamic_state_value = sess.run([dynamic_state], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) - - # outputs are fully calculated for t = 0, 1 - for v in dynamic_values[:2]: - self.assertAllClose(v, input_value + 1.0) - - # outputs at t = 2 are zero for entry 0, calculated for entry 1 - self.assertAllClose( - dynamic_values[2], - np.vstack(( - np.zeros((input_size)), - 1.0 + input_value[1, :]))) - - 
# outputs at t = 3+ are zero - for v in dynamic_values[3:]: - self.assertAllEqual(v, np.zeros_like(input_value)) - - # the final states are: - # entry 0: the values from the calculation at t=1 - # entry 1: the values from the calculation at t=2 - self.assertAllEqual( - dynamic_state_value[0], - np.vstack(( - 1.0 * (1 + 1) * np.ones((input_size)), - 1.0 * (2 + 1) * np.ones((input_size))))) - - def testDynamicCalculation(self): - self._testDynamicCalculation(True) - self._testDynamicCalculation(False) - - def _testScope(self, factory, prefix="prefix", use_outer_scope=True): - with self.test_session(use_gpu=True, graph=tf.Graph()): - if use_outer_scope: - with tf.variable_scope(prefix) as scope: - factory(scope) - else: - factory(prefix) - - # check that all the variables names starts - # with the proper scope. - tf.global_variables_initializer() - all_vars = tf.global_variables() - prefix = prefix or "rnn" - scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] - tf.logging.info("RNN with scope: %s (%s)" - % (prefix, "scope" if use_outer_scope else "str")) - for v in scope_vars: - tf.logging.info(v.name) - self.assertEqual(len(scope_vars), len(all_vars)) - - def testScope(self): - def factory(scope): - cell = Plus1RNNCell() - batch_size = 2 - input_size = 5 - max_length = 8 # unrolled up to this length - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - return tf.nn.rnn(cell, inputs, dtype=tf.float32, scope=scope) - - self._testScope(factory, use_outer_scope=True) - self._testScope(factory, use_outer_scope=False) - self._testScope(factory, prefix=None, use_outer_scope=False) - class GRUTest(tf.test.TestCase): @@ -304,7 +160,7 @@ def _testDynamic(self, use_gpu): concat_inputs = tf.placeholder( tf.float32, shape=(time_steps, batch_size, input_size)) - cell = tf.nn.rnn_cell.GRUCell(num_units=num_units) + cell = tf.contrib.rnn.GRUCell(num_units=num_units) with tf.variable_scope("dynamic_scope"): outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn( @@ -352,7 +208,7 @@ def testDynamicScope(self): def factory(scope): concat_inputs = tf.placeholder( tf.float32, shape=(time_steps, batch_size, input_size)) - cell = tf.nn.rnn_cell.GRUCell(num_units=num_units) + cell = tf.contrib.rnn.GRUCell(num_units=num_units) return tf.nn.dynamic_rnn(cell, inputs=concat_inputs, sequence_length=sequence_length, time_major=True, dtype=tf.float32, @@ -369,466 +225,9 @@ def setUp(self): self._seed = 23489 np.random.seed(self._seed) - def _testNoProjNoSharding(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = tf.nn.rnn_cell.LSTMCell(num_units, initializer=initializer, - state_is_tuple=False) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), len(inputs)) - for out in outputs: - self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - sess.run(outputs, feed_dict={inputs[0]: input_value}) - - def _testCellClipping(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = 
tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, cell_clip=0.0, initializer=initializer, - state_is_tuple=False) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), len(inputs)) - for out in outputs: - self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - values = sess.run(outputs, feed_dict={inputs[0]: input_value}) - - for value in values: - # if cell c is clipped to 0, tanh(c) = 0 => m==0 - self.assertAllEqual(value, np.zeros((batch_size, num_units))) - - def _testNoProjNoShardingSimpleStateSaver(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, 2 * num_units) - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=False, initializer=initializer, - state_is_tuple=False) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - with tf.variable_scope("share_scope"): - outputs, state = tf.nn.state_saving_rnn( - cell, inputs, state_saver=state_saver, state_name="save_lstm") - self.assertEqual(len(outputs), len(inputs)) - for out in outputs: - self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - (last_state_value, saved_state_value) = sess.run( - [state, state_saver.saved_state["save_lstm"]], - feed_dict={inputs[0]: input_value}) - self.assertAllEqual(last_state_value, saved_state_value) - - def testNoProjNoShardingTupleStateSaver(self): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - with self.test_session(graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, num_units) - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=False, initializer=initializer, - state_is_tuple=True) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - with tf.variable_scope("share_scope"): - outputs, state = tf.nn.state_saving_rnn( - cell, inputs, state_saver=state_saver, state_name=("c", "m")) - self.assertEqual(len(outputs), len(inputs)) - for out in outputs: - self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - last_and_saved_states = sess.run( - state + (state_saver.saved_state["c"], state_saver.saved_state["m"]), - feed_dict={inputs[0]: input_value}) - self.assertEqual(4, len(last_and_saved_states)) - self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:]) - - def testNoProjNoShardingNestedTupleStateSaver(self): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - with self.test_session(graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, {"c0": num_units, - "m0": num_units, - "c1": num_units + 1, - "m1": num_units + 1, - "c2": num_units + 2, - "m2": num_units + 2, - "c3": num_units + 3, - "m3": num_units + 3}) - def _cell(i): - return tf.nn.rnn_cell.LSTMCell( - num_units + i, 
use_peepholes=False, initializer=initializer, - state_is_tuple=True) - - # This creates a state tuple which has 4 sub-tuples of length 2 each. - cell = tf.nn.rnn_cell.MultiRNNCell( - [_cell(i) for i in range(4)], state_is_tuple=True) - - self.assertEqual(len(cell.state_size), 4) - for i in range(4): - self.assertEqual(len(cell.state_size[i]), 2) - - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - - state_names = (("c0", "m0"), ("c1", "m1"), - ("c2", "m2"), ("c3", "m3")) - with tf.variable_scope("share_scope"): - outputs, state = tf.nn.state_saving_rnn( - cell, inputs, state_saver=state_saver, state_name=state_names) - self.assertEqual(len(outputs), len(inputs)) - - # Final output comes from _cell(3) which has state size num_units + 3 - for out in outputs: - self.assertEqual(out.get_shape().as_list(), [batch_size, num_units + 3]) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - last_states = sess.run( - list(nest.flatten(state)), feed_dict={inputs[0]: input_value}) - saved_states = sess.run( - list(state_saver.saved_state.values()), - feed_dict={inputs[0]: input_value}) - self.assertEqual(8, len(last_states)) - self.assertEqual(8, len(saved_states)) - flat_state_names = nest.flatten(state_names) - named_saved_states = dict( - zip(state_saver.saved_state.keys(), saved_states)) - - for i in range(8): - self.assertAllEqual( - last_states[i], - named_saved_states[flat_state_names[i]]) - - def _testProjNoSharding(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer, - state_is_tuple=False) - outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), len(inputs)) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - sess.run(outputs, feed_dict={inputs[0]: input_value}) - - def testStateTupleWithProjAndSequenceLength(self): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - max_length = 8 - sequence_length = [4, 6] - with self.test_session(graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - cell_notuple = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer, state_is_tuple=False) - cell_tuple = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer, state_is_tuple=True) - with tf.variable_scope("root") as scope: - outputs_notuple, state_notuple = tf.nn.rnn( - cell_notuple, inputs, dtype=tf.float32, - sequence_length=sequence_length, scope=scope) - scope.reuse_variables() - outputs_tuple, state_tuple = tf.nn.rnn( - cell_tuple, inputs, dtype=tf.float32, - sequence_length=sequence_length, scope=scope) - self.assertEqual(len(outputs_notuple), len(inputs)) - self.assertEqual(len(outputs_tuple), len(inputs)) - self.assertTrue(isinstance(state_tuple, tuple)) - self.assertTrue(isinstance(state_notuple, tf.Tensor)) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - 
outputs_notuple_v = sess.run( - outputs_notuple, feed_dict={inputs[0]: input_value}) - outputs_tuple_v = sess.run( - outputs_tuple, feed_dict={inputs[0]: input_value}) - self.assertAllEqual(outputs_notuple_v, outputs_tuple_v) - - (state_notuple_v,) = sess.run( - (state_notuple,), feed_dict={inputs[0]: input_value}) - state_tuple_v = sess.run( - state_tuple, feed_dict={inputs[0]: input_value}) - self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v)) - - def _testProjSharding(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - num_proj_shards = 3 - num_unit_shards = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - - cell = tf.nn.rnn_cell.LSTMCell( - num_units, - use_peepholes=True, - num_proj=num_proj, - num_unit_shards=num_unit_shards, - num_proj_shards=num_proj_shards, - initializer=initializer, - state_is_tuple=False) - - outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - - self.assertEqual(len(outputs), len(inputs)) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - sess.run(outputs, feed_dict={inputs[0]: input_value}) - - def _testDoubleInput(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - num_proj_shards = 3 - num_unit_shards = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) - inputs = max_length * [ - tf.placeholder(tf.float64, shape=(None, input_size))] - - cell = tf.nn.rnn_cell.LSTMCell( - num_units, - use_peepholes=True, - num_proj=num_proj, - num_unit_shards=num_unit_shards, - num_proj_shards=num_proj_shards, - initializer=initializer, - state_is_tuple=False) - - outputs, _ = tf.nn.rnn( - cell, inputs, initial_state=cell.zero_state(batch_size, tf.float64)) - - self.assertEqual(len(outputs), len(inputs)) - - tf.global_variables_initializer().run() - input_value = np.asarray(np.random.randn(batch_size, input_size), - dtype=np.float64) - values = sess.run(outputs, feed_dict={inputs[0]: input_value}) - self.assertEqual(values[0].dtype, input_value.dtype) - - def _testShardNoShardEquivalentOutput(self, use_gpu): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - num_proj_shards = 3 - num_unit_shards = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - initializer = tf.constant_initializer(0.001) - - cell_noshard = tf.nn.rnn_cell.LSTMCell( - num_units, - num_proj=num_proj, - use_peepholes=True, - initializer=initializer, - num_unit_shards=num_unit_shards, - num_proj_shards=num_proj_shards, - state_is_tuple=False) - - cell_shard = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - initializer=initializer, num_proj=num_proj, - state_is_tuple=False) - - with tf.variable_scope("noshard_scope"): - outputs_noshard, state_noshard = tf.nn.rnn( - cell_noshard, inputs, dtype=tf.float32) - with tf.variable_scope("shard_scope"): - outputs_shard, state_shard = tf.nn.rnn( - cell_shard, inputs, dtype=tf.float32) - - self.assertEqual(len(outputs_noshard), len(inputs)) - self.assertEqual(len(outputs_noshard), len(outputs_shard)) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) 
- feeds = dict((x, input_value) for x in inputs) - values_noshard = sess.run(outputs_noshard, feed_dict=feeds) - values_shard = sess.run(outputs_shard, feed_dict=feeds) - state_values_noshard = sess.run([state_noshard], feed_dict=feeds) - state_values_shard = sess.run([state_shard], feed_dict=feeds) - self.assertEqual(len(values_noshard), len(values_shard)) - self.assertEqual(len(state_values_noshard), len(state_values_shard)) - for (v_noshard, v_shard) in zip(values_noshard, values_shard): - self.assertAllClose(v_noshard, v_shard, atol=1e-3) - for (s_noshard, s_shard) in zip(state_values_noshard, state_values_shard): - self.assertAllClose(s_noshard, s_shard, atol=1e-3) - - def _testDoubleInputWithDropoutAndDynamicCalculation( - self, use_gpu): - """Smoke test for using LSTM with doubles, dropout, dynamic calculation.""" - - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - num_proj_shards = 3 - num_unit_shards = 2 - max_length = 8 - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - sequence_length = tf.placeholder(tf.int64) - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - inputs = max_length * [ - tf.placeholder(tf.float64, shape=(None, input_size))] - - cell = tf.nn.rnn_cell.LSTMCell( - num_units, - use_peepholes=True, - num_proj=num_proj, - num_unit_shards=num_unit_shards, - num_proj_shards=num_proj_shards, - initializer=initializer, - state_is_tuple=False) - dropout_cell = tf.nn.rnn_cell.DropoutWrapper(cell, 0.5, seed=0) - - outputs, state = tf.nn.rnn( - dropout_cell, inputs, sequence_length=sequence_length, - initial_state=cell.zero_state(batch_size, tf.float64)) - - self.assertEqual(len(outputs), len(inputs)) - - tf.global_variables_initializer().run(feed_dict={sequence_length: [2, 3]}) - input_value = np.asarray(np.random.randn(batch_size, input_size), - dtype=np.float64) - values = sess.run(outputs, feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) - state_value = sess.run([state], feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) - self.assertEqual(values[0].dtype, input_value.dtype) - self.assertEqual(state_value[0].dtype, input_value.dtype) - - def testSharingWeightsWithReuse(self): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - max_length = 8 - with self.test_session(graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) - initializer_d = tf.random_uniform_initializer(-1, 1, seed=self._seed+1) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer, - state_is_tuple=False) - cell_d = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer_d, - state_is_tuple=False) - - with tf.variable_scope("share_scope"): - outputs0, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - with tf.variable_scope("share_scope", reuse=True): - outputs1, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - with tf.variable_scope("diff_scope"): - outputs2, _ = tf.nn.rnn(cell_d, inputs, dtype=tf.float32) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - output_values = sess.run( - outputs0 + outputs1 + outputs2, feed_dict={inputs[0]: input_value}) - outputs0_values = output_values[:max_length] - outputs1_values = output_values[max_length:2*max_length] - outputs2_values = output_values[2*max_length:] - 
self.assertEqual(len(outputs0_values), len(outputs1_values)) - self.assertEqual(len(outputs0_values), len(outputs2_values)) - for o1, o2, o3 in zip(outputs0_values, outputs1_values, outputs2_values): - # Same weights used by both RNNs so outputs should be the same. - self.assertAllEqual(o1, o2) - # Different weights used so outputs should be different. - self.assertTrue(np.linalg.norm(o1-o3) > 1e-6) - - def testSharingWeightsWithDifferentNamescope(self): - num_units = 3 - input_size = 5 - batch_size = 2 - num_proj = 4 - max_length = 8 - with self.test_session(graph=tf.Graph()) as sess: - initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(None, input_size))] - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=True, - num_proj=num_proj, initializer=initializer, - state_is_tuple=False) - - with tf.name_scope("scope0"): - with tf.variable_scope("share_scope"): - outputs0, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - with tf.name_scope("scope1"): - with tf.variable_scope("share_scope", reuse=True): - outputs1, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) - - tf.global_variables_initializer().run() - input_value = np.random.randn(batch_size, input_size) - output_values = sess.run( - outputs0 + outputs1, feed_dict={inputs[0]: input_value}) - outputs0_values = output_values[:max_length] - outputs1_values = output_values[max_length:] - self.assertEqual(len(outputs0_values), len(outputs1_values)) - for out0, out1 in zip(outputs0_values, outputs1_values): - self.assertAllEqual(out0, out1) - def testDynamicRNNAllowsUnknownTimeDimension(self): inputs = tf.placeholder(tf.float32, shape=[1, None, 20]) - cell = tf.nn.rnn_cell.GRUCell(30) + cell = tf.contrib.rnn.GRUCell(30) # Smoke test, this should not raise an error tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) @@ -844,19 +243,19 @@ def testDynamicRNNWithTupleStates(self): inputs = max_length * [ tf.placeholder(tf.float32, shape=(None, input_size))] inputs_c = tf.stack(inputs) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units, use_peepholes=True, num_proj=num_proj, initializer=initializer, state_is_tuple=True) with tf.variable_scope("root") as scope: - outputs_static, state_static = tf.nn.rnn( + outputs_static, state_static = tf.contrib.rnn.static_rnn( cell, inputs, dtype=tf.float32, sequence_length=sequence_length, scope=scope) scope.reuse_variables() outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn( cell, inputs_c, dtype=tf.float32, time_major=True, sequence_length=sequence_length, scope=scope) - self.assertTrue(isinstance(state_static, tf.nn.rnn_cell.LSTMStateTuple)) - self.assertTrue(isinstance(state_dynamic, tf.nn.rnn_cell.LSTMStateTuple)) + self.assertTrue(isinstance(state_static, tf.contrib.rnn.LSTMStateTuple)) + self.assertTrue(isinstance(state_dynamic, tf.contrib.rnn.LSTMStateTuple)) self.assertEqual(state_static[0], state_static.c) self.assertEqual(state_static[1], state_static.h) self.assertEqual(state_dynamic[0], state_dynamic.c) @@ -891,12 +290,12 @@ def testDynamicRNNWithNestedTupleStates(self): tf.placeholder(tf.float32, shape=(None, input_size))] inputs_c = tf.stack(inputs) def _cell(i): - return tf.nn.rnn_cell.LSTMCell( + return tf.contrib.rnn.LSTMCell( num_units + i, use_peepholes=True, num_proj=num_proj + i, initializer=initializer, state_is_tuple=True) # This creates a state tuple which has 4 sub-tuples of length 2 each. 
- cell = tf.nn.rnn_cell.MultiRNNCell( + cell = tf.contrib.rnn.MultiRNNCell( [_cell(i) for i in range(4)], state_is_tuple=True) self.assertEqual(len(cell.state_size), 4) @@ -910,7 +309,7 @@ def _cell(i): self.assertEqual(test_zero[i][1].get_shape()[1], cell.state_size[i][1]) with tf.variable_scope("root") as scope: - outputs_static, state_static = tf.nn.rnn( + outputs_static, state_static = tf.contrib.rnn.static_rnn( cell, inputs, dtype=tf.float32, sequence_length=sequence_length, scope=scope) scope.reuse_variables() @@ -955,12 +354,12 @@ def _testDynamicEquivalentToStaticRNN(self, use_gpu, use_sequence_length): inputs = tf.unstack(concat_inputs) initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units, use_peepholes=True, initializer=initializer, num_proj=num_proj, state_is_tuple=False) with tf.variable_scope("dynamic_scope"): - outputs_static, state_static = tf.nn.rnn( + outputs_static, state_static = tf.contrib.rnn.static_rnn( cell, inputs, sequence_length=sequence_length, dtype=tf.float32) feeds = {concat_inputs: input_values} @@ -1010,7 +409,7 @@ def _testDynamicEquivalentToStaticRNN(self, use_gpu, use_sequence_length): inputs = tf.unstack(concat_inputs) initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units, use_peepholes=True, initializer=initializer, num_proj=num_proj, state_is_tuple=False) @@ -1084,38 +483,6 @@ def _testDynamicEquivalentToStaticRNN(self, use_gpu, use_sequence_length): "Comparing individual variable gradients iteration %d" % i) self.assertAllEqual(a, b) - def testNoProjNoShardingSimpleStateSaver(self): - self._testNoProjNoShardingSimpleStateSaver(use_gpu=False) - self._testNoProjNoShardingSimpleStateSaver(use_gpu=True) - - def testNoProjNoSharding(self): - self._testNoProjNoSharding(use_gpu=False) - self._testNoProjNoSharding(use_gpu=True) - - def testCellClipping(self): - self._testCellClipping(use_gpu=False) - self._testCellClipping(use_gpu=True) - - def testProjNoSharding(self): - self._testProjNoSharding(use_gpu=False) - self._testProjNoSharding(use_gpu=True) - - def testProjSharding(self): - self._testProjSharding(use_gpu=False) - self._testProjSharding(use_gpu=True) - - def testShardNoShardEquivalentOutput(self): - self._testShardNoShardEquivalentOutput(use_gpu=False) - self._testShardNoShardEquivalentOutput(use_gpu=True) - - def testDoubleInput(self): - self._testDoubleInput(use_gpu=False) - self._testDoubleInput(use_gpu=True) - - def testDoubleInputWithDropoutAndDynamicCalculation(self): - self._testDoubleInputWithDropoutAndDynamicCalculation(use_gpu=False) - self._testDoubleInputWithDropoutAndDynamicCalculation(use_gpu=True) - def testDynamicEquivalentToStaticRNN(self): self._testDynamicEquivalentToStaticRNN( use_gpu=False, use_sequence_length=False) @@ -1133,138 +500,6 @@ def setUp(self): self._seed = 23489 np.random.seed(self._seed) - def _createBidirectionalRNN(self, - use_gpu, - use_shape, - use_sequence_length, - scope=None): - num_units = 3 - input_size = 5 - batch_size = 2 - max_length = 8 - - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - sequence_length = tf.placeholder(tf.int64) if use_sequence_length else None - cell_fw = tf.nn.rnn_cell.LSTMCell(num_units, - input_size, - initializer=initializer, - state_is_tuple=False) - cell_bw = tf.nn.rnn_cell.LSTMCell(num_units, - input_size, - initializer=initializer, - state_is_tuple=False) 
- inputs = max_length * [ - tf.placeholder( - tf.float32, - shape=(batch_size, input_size) if use_shape else (None, input_size)) - ] - outputs, state_fw, state_bw = tf.nn.bidirectional_rnn( - cell_fw, - cell_bw, - inputs, - dtype=tf.float32, - sequence_length=sequence_length, - scope=scope) - self.assertEqual(len(outputs), len(inputs)) - for out in outputs: - self.assertEqual( - out.get_shape().as_list(), - [batch_size if use_shape else None, 2 * num_units]) - - input_value = np.random.randn(batch_size, input_size) - outputs = tf.stack(outputs) - - return input_value, inputs, outputs, state_fw, state_bw, sequence_length - - def _testBidirectionalRNN(self, use_gpu, use_shape): - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - input_value, inputs, outputs, state_fw, state_bw, sequence_length = ( - self._createBidirectionalRNN(use_gpu, use_shape, True)) - tf.global_variables_initializer().run() - # Run with pre-specified sequence length of 2, 3 - out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) - - # Since the forward and backward LSTM cells were initialized with the - # same parameters, the forward and backward output has to be the same, - # but reversed in time. The format is output[time][batch][depth], and - # due to depth concatenation (as num_units=3 for both RNNs): - # - forward output: out[][][depth] for 0 <= depth < 3 - # - backward output: out[][][depth] for 4 <= depth < 6 - # - # First sequence in batch is length=2 - # Check that the time=0 forward output is equal to time=1 backward output - self.assertEqual(out[0][0][0], out[1][0][3]) - self.assertEqual(out[0][0][1], out[1][0][4]) - self.assertEqual(out[0][0][2], out[1][0][5]) - # Check that the time=1 forward output is equal to time=0 backward output - self.assertEqual(out[1][0][0], out[0][0][3]) - self.assertEqual(out[1][0][1], out[0][0][4]) - self.assertEqual(out[1][0][2], out[0][0][5]) - - # Second sequence in batch is length=3 - # Check that the time=0 forward output is equal to time=2 backward output - self.assertEqual(out[0][1][0], out[2][1][3]) - self.assertEqual(out[0][1][1], out[2][1][4]) - self.assertEqual(out[0][1][2], out[2][1][5]) - # Check that the time=1 forward output is equal to time=1 backward output - self.assertEqual(out[1][1][0], out[1][1][3]) - self.assertEqual(out[1][1][1], out[1][1][4]) - self.assertEqual(out[1][1][2], out[1][1][5]) - # Check that the time=2 forward output is equal to time=0 backward output - self.assertEqual(out[2][1][0], out[0][1][3]) - self.assertEqual(out[2][1][1], out[0][1][4]) - self.assertEqual(out[2][1][2], out[0][1][5]) - # Via the reasoning above, the forward and backward final state should be - # exactly the same - self.assertAllClose(s_fw, s_bw) - - def _testBidirectionalRNNWithoutSequenceLength(self, use_gpu, use_shape): - with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: - input_value, inputs, outputs, state_fw, state_bw, _ = ( - self._createBidirectionalRNN(use_gpu, use_shape, False)) - tf.global_variables_initializer().run() - out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value}) - - # Since the forward and backward LSTM cells were initialized with the - # same parameters, the forward and backward output has to be the same, - # but reversed in time. 
The format is output[time][batch][depth], and - # due to depth concatenation (as num_units=3 for both RNNs): - # - forward output: out[][][depth] for 0 <= depth < 3 - # - backward output: out[][][depth] for 4 <= depth < 6 - # - # Both sequences in batch are length=8. Check that the time=i - # forward output is equal to time=8-1-i backward output - for i in xrange(8): - self.assertEqual(out[i][0][0], out[8 - 1 - i][0][3]) - self.assertEqual(out[i][0][1], out[8 - 1 - i][0][4]) - self.assertEqual(out[i][0][2], out[8 - 1 - i][0][5]) - for i in xrange(8): - self.assertEqual(out[i][1][0], out[8 - 1 - i][1][3]) - self.assertEqual(out[i][1][1], out[8 - 1 - i][1][4]) - self.assertEqual(out[i][1][2], out[8 - 1 - i][1][5]) - # Via the reasoning above, the forward and backward final state should be - # exactly the same - self.assertAllClose(s_fw, s_bw) - - def testBidirectionalRNN(self): - self._testBidirectionalRNN(use_gpu=False, use_shape=False) - self._testBidirectionalRNN(use_gpu=True, use_shape=False) - self._testBidirectionalRNN(use_gpu=False, use_shape=True) - self._testBidirectionalRNN(use_gpu=True, use_shape=True) - - def testBidirectionalRNNWithoutSequenceLength(self): - self._testBidirectionalRNNWithoutSequenceLength(use_gpu=False, - use_shape=False) - self._testBidirectionalRNNWithoutSequenceLength(use_gpu=True, - use_shape=False) - self._testBidirectionalRNNWithoutSequenceLength(use_gpu=False, - use_shape=True) - self._testBidirectionalRNNWithoutSequenceLength(use_gpu=True, - use_shape=True) - def _createBidirectionalDynamicRNN(self, use_gpu, use_shape, use_state_tuple, use_time_major, scope=None): @@ -1275,10 +510,10 @@ def _createBidirectionalDynamicRNN(self, use_gpu, use_shape, initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) sequence_length = tf.placeholder(tf.int64) - cell_fw = tf.nn.rnn_cell.LSTMCell(num_units, + cell_fw = tf.contrib.rnn.LSTMCell(num_units, initializer=initializer, state_is_tuple=use_state_tuple) - cell_bw = tf.nn.rnn_cell.LSTMCell(num_units, + cell_bw = tf.contrib.rnn.LSTMCell(num_units, initializer=initializer, state_is_tuple=use_state_tuple) inputs = max_length * [ @@ -1398,16 +633,6 @@ def _testScope(self, factory, prefix="prefix", use_outer_scope=True): tf.logging.info(v.name) self.assertEqual(len(scope_vars), len(all_vars)) - def testBidirectionalRNNScope(self): - def factory(scope): - return self._createBidirectionalRNN( - use_gpu=True, use_shape=True, - use_sequence_length=True, scope=scope) - - self._testScope(factory, use_outer_scope=True) - self._testScope(factory, use_outer_scope=False) - self._testScope(factory, prefix=None, use_outer_scope=False) - def testBidirectionalDynamicRNNScope(self): def get_factory(use_time_major): def factory(scope): @@ -1446,27 +671,14 @@ def testMultiDimensionalLSTMAllRNNContainers(self): # variables. 
cell = DummyMultiDimensionalLSTM(feature_dims) state_saver = TestStateSaver(batch_size, input_size) - outputs_static, state_static = tf.nn.rnn( + outputs_static, state_static = tf.contrib.rnn.static_rnn( cell, inputs, dtype=tf.float32, sequence_length=sequence_length) outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn( cell, inputs_c, dtype=tf.float32, time_major=True, sequence_length=sequence_length) - outputs_bid, state_bid_fw, state_bid_bw = tf.nn.bidirectional_rnn( - cell, cell, inputs_using_dim, dtype=tf.float32, - sequence_length=sequence_length) - outputs_sav, state_sav = tf.nn.state_saving_rnn( - cell, inputs_using_dim, sequence_length=sequence_length, - state_saver=state_saver, state_name=("h", "c")) - for out, inp in zip(outputs_static, inputs): - self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list()) self.assertEqual(outputs_dynamic.get_shape().as_list(), inputs_c.get_shape().as_list()) - for out, inp in zip(outputs_bid, inputs_using_dim): - input_shape_list = inp.get_shape().as_list() - # fwd and bwd activations are concatenated along the second dim. - input_shape_list[1] *= 2 - self.assertEqual(out.get_shape().as_list(), input_shape_list) tf.global_variables_initializer().run() @@ -1476,37 +688,14 @@ def testMultiDimensionalLSTMAllRNNContainers(self): outputs_static, feed_dict={inputs[0]: input_value}) outputs_dynamic_v = sess.run( outputs_dynamic, feed_dict={inputs[0]: input_value}) - outputs_bid_v = sess.run( - outputs_bid, feed_dict={inputs_using_dim[0]: input_value}) - outputs_sav_v = sess.run( - outputs_sav, feed_dict={inputs_using_dim[0]: input_value}) - self.assertAllEqual(outputs_static_v, outputs_dynamic_v) - self.assertAllEqual(outputs_static_v, outputs_sav_v) - outputs_static_array = np.array(outputs_static_v) - outputs_static_array_double = np.concatenate( - (outputs_static_array, outputs_static_array), axis=2) - outputs_bid_array = np.array(outputs_bid_v) - self.assertAllEqual(outputs_static_array_double, outputs_bid_array) state_static_v = sess.run( state_static, feed_dict={inputs[0]: input_value}) state_dynamic_v = sess.run( state_dynamic, feed_dict={inputs[0]: input_value}) - state_bid_fw_v = sess.run( - state_bid_fw, feed_dict={inputs_using_dim[0]: input_value}) - state_bid_bw_v = sess.run( - state_bid_bw, feed_dict={inputs_using_dim[0]: input_value}) - state_sav_v = sess.run( - state_sav, feed_dict={inputs_using_dim[0]: input_value}) self.assertAllEqual( np.hstack(state_static_v), np.hstack(state_dynamic_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_sav_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_bid_fw_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_bid_bw_v)) class NestedLSTMTest(tf.test.TestCase): @@ -1539,15 +728,9 @@ def testNestedIOLSTMAllRNNContainers(self): outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn( cell, inputs_c, dtype=tf.float32, time_major=True, sequence_length=sequence_length) - outputs_static, state_static = tf.nn.rnn( + outputs_static, state_static = tf.contrib.rnn.static_rnn( cell, inputs, dtype=tf.float32, sequence_length=sequence_length) - outputs_bid, state_bid_fw, state_bid_bw = tf.nn.bidirectional_rnn( - cell, cell, inputs_using_dim, dtype=tf.float32, - sequence_length=sequence_length) - outputs_sav, state_sav = tf.nn.state_saving_rnn( - cell, inputs_using_dim, sequence_length=sequence_length, - state_saver=state_saver, state_name=("h", "c")) def _assert_same_shape(input1, input2, double=False): flat_input1 = nest.flatten(input1) @@ 
-1560,8 +743,6 @@ def _assert_same_shape(input1, input2, double=False): _assert_same_shape(inputs_c, outputs_dynamic) _assert_same_shape(inputs, outputs_static) - _assert_same_shape(inputs_using_dim, outputs_sav) - _assert_same_shape(inputs_using_dim, outputs_bid, double=True) tf.global_variables_initializer().run() @@ -1572,38 +753,16 @@ def _assert_same_shape(input1, input2, double=False): outputs_dynamic, feed_dict={single_input: input_value}) outputs_static_v = sess.run( outputs_static, feed_dict={single_input: input_value}) - outputs_sav_v = sess.run( - outputs_sav, feed_dict={single_input_using_dim: input_value}) - outputs_bid_v = sess.run( - outputs_bid, feed_dict={single_input_using_dim: input_value}) self.assertAllEqual(outputs_static_v, np.transpose(outputs_dynamic_v, (1, 0, 2, 3))) - self.assertAllEqual(outputs_static_v, outputs_sav_v) - outputs_static_array = np.array(outputs_static_v) - outputs_static_array_double = np.concatenate( - (outputs_static_array, outputs_static_array), axis=3) - outputs_bid_array = np.array(outputs_bid_v) - self.assertAllEqual(outputs_static_array_double, outputs_bid_array) state_dynamic_v = sess.run( state_dynamic, feed_dict={single_input: input_value}) state_static_v = sess.run( state_static, feed_dict={single_input: input_value}) - state_bid_fw_v = sess.run( - state_bid_fw, feed_dict={single_input_using_dim: input_value}) - state_bid_bw_v = sess.run( - state_bid_bw, feed_dict={single_input_using_dim: input_value}) - state_sav_v = sess.run( - state_sav, feed_dict={single_input_using_dim: input_value}) self.assertAllEqual( np.hstack(state_static_v), np.hstack(state_dynamic_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_sav_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_bid_fw_v)) - self.assertAllEqual( - np.hstack(state_static_v), np.hstack(state_bid_bw_v)) class RawRNNTest(tf.test.TestCase): @@ -1624,7 +783,7 @@ def _testRawRNN(self, max_time): inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True) def loop_fn(time_, cell_output, cell_state, unused_loop_state): emit_output = cell_output # == None for time == 0 @@ -1717,7 +876,7 @@ def testLoopState(self): inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True) def loop_fn(time_, cell_output, cell_state, loop_state): if cell_output is None: @@ -1752,7 +911,7 @@ def testLoopStateWithTensorArray(self): inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True) def loop_fn(time_, cell_output, cell_state, loop_state): if cell_output is None: loop_state = tf.TensorArray( @@ -1790,7 +949,7 @@ def testEmitDifferentStructureThanCellOutput(self): inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True) def loop_fn(time_, cell_output, cell_state, _): if cell_output is None: emit_output = (tf.zeros([2, 3], dtype=tf.int32), @@ -1853,7 +1012,7 @@ def 
factory(scope): inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True) def loop_fn(time_, cell_output, cell_state, unused_loop_state): emit_output = cell_output # == None for time == 0 if cell_output is None: # time == 0 @@ -1877,62 +1036,131 @@ def loop_fn(time_, cell_output, cell_state, unused_loop_state): self._testScope(factory, prefix=None, use_outer_scope=False) -class StateSaverRNNTest(tf.test.TestCase): +class DeviceWrapperCell(tf.contrib.rnn.RNNCell): + """Class to ensure cell calculation happens on a specific device.""" - def setUp(self): - self._seed = 23489 - np.random.seed(self._seed) + def __init__(self, cell, device): + self._cell = cell + self._device = device - def _testScope(self, factory, prefix="prefix", use_outer_scope=True): - with self.test_session(use_gpu=True, graph=tf.Graph()): - if use_outer_scope: - with tf.variable_scope(prefix) as scope: - factory(scope) - else: - factory(prefix) - tf.global_variables_initializer() + @property + def output_size(self): + return self._cell.output_size - # check that all the variables names starts - # with the proper scope. - all_vars = tf.global_variables() - prefix = prefix or "rnn" - scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")] - tf.logging.info("RNN with scope: %s (%s)" - % (prefix, "scope" if use_outer_scope else "str")) - for v in scope_vars: - tf.logging.info(v.name) - self.assertEqual(len(scope_vars), len(all_vars)) + @property + def state_size(self): + return self._cell.state_size - def testStateSaverRNNScope(self): - num_units = 3 + def __call__(self, input_, state, scope=None): + if self._device is not None: + with tf.device(self._device): + return self._cell(input_, state, scope) + else: + return self._cell(input_, state, scope) + + +class TensorArrayOnCorrectDeviceTest(tf.test.TestCase): + + def _execute_rnn_on( + self, rnn_device=None, cell_device=None, input_device=None): + batch_size = 3 + time_steps = 7 input_size = 5 - batch_size = 2 - max_length = 8 - def factory(scope): - initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, 2 * num_units) - cell = tf.nn.rnn_cell.LSTMCell( - num_units, use_peepholes=False, initializer=initializer, - state_is_tuple=False) - inputs = max_length * [ - tf.placeholder(tf.float32, shape=(batch_size, input_size))] - return tf.nn.state_saving_rnn( - cell, inputs, state_saver=state_saver, - state_name="save_lstm", scope=scope) + num_units = 10 + + cell = tf.contrib.rnn.LSTMCell(num_units, use_peepholes=True) + gpu_cell = DeviceWrapperCell(cell, cell_device) + inputs = np.random.randn(batch_size, time_steps, input_size).astype( + np.float32) + sequence_length = np.random.randint(0, time_steps, size=batch_size) + + if input_device is not None: + with tf.device(input_device): + inputs = tf.constant(inputs) + + if rnn_device is not None: + with tf.device(rnn_device): + outputs, _ = tf.nn.dynamic_rnn( + gpu_cell, inputs, sequence_length=sequence_length, dtype=tf.float32) + else: + outputs, _ = tf.nn.dynamic_rnn( + gpu_cell, inputs, sequence_length=sequence_length, dtype=tf.float32) + + with self.test_session(use_gpu=True) as sess: + opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + run_metadata = tf.RunMetadata() + tf.global_variables_initializer().run() + sess.run(outputs, options=opts, run_metadata=run_metadata) 
+ + return run_metadata + + def testRNNOnCPUCellOnGPU(self): + if not tf.test.is_gpu_available(): + return # Test requires access to a GPU + + run_metadata = self._execute_rnn_on( + rnn_device="/cpu:0", cell_device="/gpu:0") + step_stats = run_metadata.step_stats + ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1 + gpu_stats = step_stats.dev_stats[ix].node_stats + cpu_stats = step_stats.dev_stats[1 - ix].node_stats + def _assert_in(op_str, in_stats, out_stats): + self.assertTrue(any(op_str in s.node_name for s in in_stats)) + self.assertFalse(any(op_str in s.node_name for s in out_stats)) + + # Writes happen at output of RNN cell + _assert_in("TensorArrayWrite", gpu_stats, cpu_stats) + # Gather happens on final TensorArray + _assert_in("TensorArrayGather", gpu_stats, cpu_stats) + # Reads happen at input to RNN cell + _assert_in("TensorArrayRead", cpu_stats, gpu_stats) + # Scatters happen to get initial input into TensorArray + _assert_in("TensorArrayScatter", cpu_stats, gpu_stats) + + def testRNNOnCPUCellOnCPU(self): + if not tf.test.is_gpu_available(): + return # Test requires access to a GPU + + run_metadata = self._execute_rnn_on( + rnn_device="/cpu:0", cell_device="/cpu:0", input_device="/gpu:0") + step_stats = run_metadata.step_stats + ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1 + gpu_stats = step_stats.dev_stats[ix].node_stats + cpu_stats = step_stats.dev_stats[1 - ix].node_stats + def _assert_in(op_str, in_stats, out_stats): + self.assertTrue(any(op_str in s.node_name for s in in_stats)) + self.assertFalse(any(op_str in s.node_name for s in out_stats)) + + # All TensorArray operations happen on CPU + _assert_in("TensorArray", cpu_stats, gpu_stats) + + def testInputOnGPUCellNotDeclared(self): + if not tf.test.is_gpu_available(): + return # Test requires access to a GPU + + run_metadata = self._execute_rnn_on(input_device="/gpu:0") + step_stats = run_metadata.step_stats + ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1 + gpu_stats = step_stats.dev_stats[ix].node_stats + cpu_stats = step_stats.dev_stats[1 - ix].node_stats + def _assert_in(op_str, in_stats, out_stats): + self.assertTrue(any(op_str in s.node_name for s in in_stats)) + self.assertFalse(any(op_str in s.node_name for s in out_stats)) + + # Everything happens on GPU + _assert_in("TensorArray", gpu_stats, cpu_stats) - self._testScope(factory, use_outer_scope=True) - self._testScope(factory, use_outer_scope=False) - self._testScope(factory, prefix=None, use_outer_scope=False) ######### Benchmarking RNN code + def _static_vs_dynamic_rnn_benchmark_static(inputs_list_t, sequence_length): (_, input_size) = inputs_list_t[0].get_shape().as_list() initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=input_size, use_peepholes=True, initializer=initializer, state_is_tuple=False) - outputs, final_state = tf.nn.rnn( + outputs, final_state = tf.contrib.rnn.static_rnn( cell, inputs_list_t, sequence_length=sequence_length, dtype=tf.float32) trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) @@ -1944,7 +1172,7 @@ def _static_vs_dynamic_rnn_benchmark_static(inputs_list_t, sequence_length): def _static_vs_dynamic_rnn_benchmark_dynamic(inputs_t, sequence_length): (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list() initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=input_size, use_peepholes=True, 
initializer=initializer, state_is_tuple=False) outputs, final_state = tf.nn.dynamic_rnn( @@ -2048,10 +1276,10 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu): def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length): (_, input_size) = inputs_list_t[0].get_shape().as_list() initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=input_size, use_peepholes=True, initializer=initializer, state_is_tuple=False) - outputs, final_state = tf.nn.rnn( + outputs, final_state = tf.contrib.rnn.static_rnn( cell, inputs_list_t, sequence_length=sequence_length, dtype=tf.float32) trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) @@ -2102,10 +1330,10 @@ def _concat_state_vs_tuple_state_rnn_benchmark( inputs_list_t, sequence_length, state_is_tuple): (_, input_size) = inputs_list_t[0].get_shape().as_list() initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=input_size, use_peepholes=True, initializer=initializer, state_is_tuple=state_is_tuple) - outputs, final_state = tf.nn.rnn( + outputs, final_state = tf.contrib.rnn.static_rnn( cell, inputs_list_t, sequence_length=sequence_length, dtype=tf.float32) final_state = list(final_state) if state_is_tuple else [final_state] @@ -2158,7 +1386,7 @@ def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length, swap_memory): (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list() initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) - cell = tf.nn.rnn_cell.LSTMCell( + cell = tf.contrib.rnn.LSTMCell( num_units=input_size, use_peepholes=True, initializer=initializer, state_is_tuple=False) outputs, final_state = tf.nn.dynamic_rnn( diff --git a/tensorflow/python/kernel_tests/scalar_strict_test.py b/tensorflow/python/kernel_tests/scalar_strict_test.py index 7d3a9533872923..eec6ec3f94fad0 100644 --- a/tensorflow/python/kernel_tests/scalar_strict_test.py +++ b/tensorflow/python/kernel_tests/scalar_strict_test.py @@ -65,7 +65,7 @@ def placeholders(args, feed): def testConcat(self): self.check(tf.concat, ([0], ([2], [3], [7])), - 'concat_dim tensor should be a scalar integer', [2, 3, 7]) + 'axis tensor should be a scalar integer', [2, 3, 7]) for data in (2, 3, 7), (2, [3], 7), (2, 3, [7]): self.check(tf.concat, (0, data), r'Expected \w+ dimensions in the range \[0, 0\)', [2, 3, 7]) @@ -116,17 +116,6 @@ def testSparseToDense(self): self.check(tf.sparse_to_dense, (1, 4, 7), 'output_shape should be a vector', [0, 7, 0, 0]) - def testImageSummary(self): - image = np.zeros((2, 2, 2, 3), dtype=np.uint8) - self.check(tf.image_summary, (['img'], image), 'Tags must be a scalar') - - def testScalarSummary(self): - self.check(tf.scalar_summary, (['a'], 7), 'not the same shape') - self.check(tf.scalar_summary, ('a', [7]), 'not the same shape') - - def testHistogramSummary(self): - self.check(tf.histogram_summary, (['a'], 7), 'tags must be scalar') - def testTile(self): self.check(tf.tile, ([7], 2), 'Expected multiples to be 1-D', [7, 7]) diff --git a/tensorflow/python/kernel_tests/seq2seq_test.py b/tensorflow/python/kernel_tests/seq2seq_test.py index 8fe876a88f6aaf..03b5f68659a5c2 100644 --- a/tensorflow/python/kernel_tests/seq2seq_test.py +++ b/tensorflow/python/kernel_tests/seq2seq_test.py @@ -328,9 +328,11 @@ def testAttentionDecoderStateIsTuple(self): self.assertEqual((2, 2), res[0][1].c.shape) 
self.assertEqual((2, 2), res[0][1].h.shape) + # pylint: disable=unused-variable,invalid-name def testDynamicAttentionDecoderStateIsTuple(self): with self.test_session() as sess: - with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + with tf.variable_scope( + "root", initializer=tf.constant_initializer(0.5)): cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True) cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell] * 2, state_is_tuple=True) @@ -601,7 +603,7 @@ def GRUSeq2Seq(enc_inp, dec_inp): num_decoder_symbols=classes, embedding_size=24, output_projection=(w, b)) targets = [dec_inp[i+1] for i in range(len(dec_inp) - 1)] + [0] - def SampledLoss(inputs, labels): + def SampledLoss(labels, inputs): labels = tf.reshape(labels, [-1, 1]) return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes) return tf.nn.seq2seq.model_with_buckets( @@ -616,7 +618,7 @@ def SampledLoss(inputs, labels): with tf.variable_scope("root"): _, losses = SampleGRUSeq2Seq(inp, out, weights) updates = [] - params = tf.all_variables() + params = tf.global_variables() optimizer = tf.train.AdamOptimizer(0.03, epsilon=1e-5) for i in range(len(buckets)): full_grads = tf.gradients(losses[i], params) @@ -669,7 +671,7 @@ def TestModel(seq2seq): np.random.seed(111) enc_inp = [tf.constant(i + 1, tf.int32, shape=[batch_size]) - for i in range(num_enc_timesteps)] + for i in range(num_enc_timesteps)] dec_inp_fp_true = [tf.constant(i, tf.int32, shape=[batch_size]) for i in range(num_dec_timesteps)] dec_inp_holder_fp_false = [tf.placeholder(tf.int32, shape=[batch_size]) @@ -693,7 +695,7 @@ def ForwardBackward(enc_inp, dec_inp, feed_previous): dec_op_fp_true, update_fp_true, variables_fp_true = ForwardBackward( enc_inp, dec_inp_fp_true, feed_previous=True) - dec_op_fp_false, update_fp_false, variables_fp_false = ForwardBackward( + _, update_fp_false, variables_fp_false = ForwardBackward( enc_inp, dec_inp_holder_fp_false, feed_previous=False) sess.run(tf.global_variables_initializer()) @@ -701,9 +703,9 @@ def ForwardBackward(enc_inp, dec_inp, feed_previous): # We only check consistencies between the variables existing in both # the models with True and False feed_previous. Variables created by # the loop_function in the model with True feed_previous are ignored. 
- v_false_name_dict = {v.name.split('/', 1)[-1]: v + v_false_name_dict = {v.name.split("/", 1)[-1]: v for v in variables_fp_false} - matched_variables = [(v, v_false_name_dict[v.name.split('/', 1)[-1]]) + matched_variables = [(v, v_false_name_dict[v.name.split("/", 1)[-1]]) for v in variables_fp_true] for v_true, v_false in matched_variables: sess.run(tf.assign(v_false, v_true)) diff --git a/tensorflow/contrib/metrics/python/kernel_tests/set_ops_test.py b/tensorflow/python/kernel_tests/sets_test.py similarity index 99% rename from tensorflow/contrib/metrics/python/kernel_tests/set_ops_test.py rename to tensorflow/python/kernel_tests/sets_test.py index 6e45cb338a856a..177b4c1aa5477b 100644 --- a/tensorflow/contrib/metrics/python/kernel_tests/set_ops_test.py +++ b/tensorflow/python/kernel_tests/sets_test.py @@ -356,7 +356,7 @@ def _test_set_intersection_3d(self, dtype, invalid_indices=False): a = tf.cast( tf.sparse_to_dense( sp_a.indices, - sp_a.shape, + sp_a.dense_shape, sp_a.values, default_value="-1" if dtype == tf.string else -1), dtype=dtype) @@ -370,7 +370,7 @@ def _test_set_intersection_3d(self, dtype, invalid_indices=False): b = tf.cast( tf.sparse_to_dense( sp_b.indices, - sp_b.shape, + sp_b.dense_shape, sp_b.values, default_value="-2" if dtype == tf.string else -2), dtype=dtype) @@ -985,7 +985,7 @@ def _assert_set_operation(self, expected_indices, expected_values, self.assertEqual( expected_set, actual_set, "Expected %s, got %s, at %s." % ( expected_set, actual_set, last_indices)) - self.assertAllEqual(expected_shape, sparse_tensor.shape) + self.assertAllEqual(expected_shape, sparse_tensor.dense_shape) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py index ac318a3f7ab5aa..03110b0e9ba6dd 100644 --- a/tensorflow/python/kernel_tests/shape_ops_test.py +++ b/tensorflow/python/kernel_tests/shape_ops_test.py @@ -37,7 +37,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): x_shape = x.shape return tf.SparseTensor( - indices=x_indices, values=x_values, shape=x_shape), len(x_values) + indices=x_indices, values=x_values, dense_shape=x_shape), len(x_values) class ShapeOpsTest(tf.test.TestCase): diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py index a2d9eaea2d923b..d1c7eeadc93c40 100644 --- a/tensorflow/python/kernel_tests/sparse_add_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py @@ -33,7 +33,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): x_shape = x.shape return tf.SparseTensor( - indices=x_indices, values=x_values, shape=x_shape), len(x_values) + indices=x_indices, values=x_values, dense_shape=x_shape), len(x_values) class SparseAddTest(tf.test.TestCase): @@ -81,7 +81,7 @@ def testAddSelf(self): sum_out = sess.run(sp_sum) - self.assertEqual(sp_sum.shape.get_shape(), [2]) + self.assertEqual(sp_sum.dense_shape.get_shape(), [2]) self.assertAllEqual( sum_out.indices, [[0, 1], [1, 0], [2, 0], [2, 1]]) self.assertAllEqual(sum_out.values, [2, 4, 6, 8]) @@ -95,7 +95,7 @@ def testAddSelfAndNegation(self): sp_sum = tf.sparse_add(sp_a, sp_b, 0.1) sum_out = sess.run(sp_sum) - self.assertEqual(sp_sum.shape.get_shape(), [2]) + self.assertEqual(sp_sum.dense_shape.get_shape(), [2]) self.assertAllEqual(sum_out.indices, np.empty([0, 2])) self.assertAllEqual(sum_out.values, []) self.assertAllEqual(sum_out.shape, [3, 3]) @@ -114,7 +114,7 @@ def testSmallValuesShouldVanish(self): sp_sum = tf.sparse_add(sp_a, sp_b, 
thresh=0.21) sum_out = sess.run(sp_sum) - self.assertEqual(sp_sum.shape.get_shape(), [2]) + self.assertEqual(sp_sum.dense_shape.get_shape(), [2]) self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0]]) self.assertAllEqual(sum_out.values, [2, 6]) self.assertAllEqual(sum_out.shape, [3, 3]) @@ -123,7 +123,7 @@ def testSmallValuesShouldVanish(self): sp_sum = tf.sparse_add(sp_a, sp_b, thresh=0.11) sum_out = sess.run(sp_sum) - self.assertEqual(sp_sum.shape.get_shape(), [2]) + self.assertEqual(sp_sum.dense_shape.get_shape(), [2]) self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0], [2, 1]]) self.assertAllClose(sum_out.values, [2, 6, -.2]) self.assertAllEqual(sum_out.shape, [3, 3]) diff --git a/tensorflow/python/kernel_tests/sparse_concat_op_test.py b/tensorflow/python/kernel_tests/sparse_concat_op_test.py index de48b969d9cc72..b65610dcc7cf38 100644 --- a/tensorflow/python/kernel_tests/sparse_concat_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_concat_op_test.py @@ -137,7 +137,7 @@ def testConcat1(self): self.assertEqual(sp_concat.indices.get_shape(), [4, 2]) self.assertEqual(sp_concat.values.get_shape(), [4]) - self.assertEqual(sp_concat.shape.get_shape(), [2]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [2]) concat_out = sess.run(sp_concat) @@ -159,7 +159,7 @@ def testConcat2(self): self.assertEqual(sp_concat.indices.get_shape(), [8, 2]) self.assertEqual(sp_concat.values.get_shape(), [8]) - self.assertEqual(sp_concat.shape.get_shape(), [2]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [2]) concat_out = sess.run(sp_concat) @@ -185,7 +185,7 @@ def testConcatDim0(self): self.assertEqual(sp_concat.indices.get_shape(), [7, 2]) self.assertEqual(sp_concat.values.get_shape(), [7]) - self.assertEqual(sp_concat.shape.get_shape(), [2]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [2]) concat_out = sess.run(sp_concat) @@ -210,7 +210,7 @@ def testConcat3(self): self.assertEqual(sp_concat.indices.get_shape(), [10, 2]) self.assertEqual(sp_concat.values.get_shape(), [10]) - self.assertEqual(sp_concat.shape.get_shape(), [2]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [2]) concat_out = sess.run(sp_concat) @@ -234,7 +234,7 @@ def testConcatNonNumeric(self): self.assertEqual(sp_concat.indices.get_shape(), [8, 2]) self.assertEqual(sp_concat.values.get_shape(), [8]) - self.assertEqual(sp_concat.shape.get_shape(), [2]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [2]) concat_out = sess.run(sp_concat) @@ -301,7 +301,7 @@ def testMismatchedShapesExpandNonconcatDim(self): [10, 2]]) self.assertAllEqual(sp_concat_dim0_out.values, [1, 2, 3, 4, 1, 2, 1, 0, 1, 2, 1, 1, 2]) - self.assertAllEqual(sp_concat_dim0_out.shape, [11, 5]) + self.assertAllEqual(sp_concat_dim0_out.dense_shape, [11, 5]) self.assertAllEqual(sp_concat_dim1_out.indices, [[0, 2], [0, 11], [1, 0], [1, 4], [1, 8], [1, 10], @@ -309,7 +309,7 @@ def testMismatchedShapesExpandNonconcatDim(self): [2, 8]]) self.assertAllEqual(sp_concat_dim1_out.values, [1, 1, 2, 1, 1, 1, 2, 3, 4, 2, 1, 0, 2]) - self.assertAllEqual(sp_concat_dim1_out.shape, [3, 13]) + self.assertAllEqual(sp_concat_dim1_out.dense_shape, [3, 13]) def testShapeInferenceUnknownShapes(self): with self.test_session(use_gpu=False): @@ -324,7 +324,7 @@ def testShapeInferenceUnknownShapes(self): self.assertEqual(sp_concat.indices.get_shape().as_list(), [None, 3]) self.assertEqual(sp_concat.values.get_shape().as_list(), [None]) - self.assertEqual(sp_concat.shape.get_shape(), [3]) + self.assertEqual(sp_concat.dense_shape.get_shape(), [3]) if __name__ == 
"__main__": diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index 554556a65afd94..41a8a486d191e4 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -41,7 +41,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): x_shape = x.shape return tf.SparseTensor( - indices=x_indices, values=x_values, shape=x_shape), len(x_values) + indices=x_indices, values=x_values, dense_shape=x_shape), len(x_values) class SparseToIndicatorTest(test_util.TensorFlowTestCase): @@ -153,7 +153,7 @@ def _AssertResultsSorted(self, output, vocab_size): output.values, [-3, 1, 4, 1, 5, 9]) self.assertAllEqual( - output.shape, + output.dense_shape, [3, vocab_size]) def _AssertResultsNotSorted(self, output, vocab_size): @@ -164,7 +164,7 @@ def _AssertResultsNotSorted(self, output, vocab_size): output.values, [-3, 4, 1, 9, 5, 1]) self.assertAllEqual( - output.shape, + output.dense_shape, [3, vocab_size]) def testInt32AndFloat32(self): @@ -254,7 +254,7 @@ def testBasic(self): self.assertAllEqual(output.indices, [[0, 0], [1, 4], [3, 2]]) self.assertAllEqual(output.values, [0, 14, 32]) - self.assertAllEqual(output.shape, [5, 6]) + self.assertAllEqual(output.dense_shape, [5, 6]) def testRetainNone(self): with self.test_session(use_gpu=False) as sess: @@ -266,7 +266,7 @@ def testRetainNone(self): self.assertAllEqual(output.indices, np.array([]).reshape((0, 2))) self.assertAllEqual(output.values, []) - self.assertAllEqual(output.shape, [5, 6]) + self.assertAllEqual(output.dense_shape, [5, 6]) def testMismatchedRetainShape(self): with self.test_session(use_gpu=False): @@ -305,7 +305,7 @@ def testBasic(self): [0, 1, 3], [1, 1, 4], [1, 3, 2], [1, 3, 3]]) self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33]) - self.assertAllEqual(output.shape, [3, 6, 7]) + self.assertAllEqual(output.dense_shape, [3, 6, 7]) def testInputUnavailableInGraphConstructionOk(self): with self.test_session(use_gpu=False) as sess: @@ -319,7 +319,7 @@ def testInputUnavailableInGraphConstructionOk(self): [0, 1, 3], [1, 1, 4], [1, 3, 2], [1, 3, 3]]) self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33]) - self.assertAllEqual(output.shape, [3, 6, 7]) + self.assertAllEqual(output.dense_shape, [3, 6, 7]) def testFeedInputUnavailableInGraphConstructionOk(self): with self.test_session(use_gpu=False) as sess: @@ -334,7 +334,7 @@ def testFeedInputUnavailableInGraphConstructionOk(self): [0, 1, 3], [1, 1, 4], [1, 3, 2], [1, 3, 3]]) self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33]) - self.assertAllEqual(output.shape, [3, 6, 7]) + self.assertAllEqual(output.dense_shape, [3, 6, 7]) def testTightBoundingBox(self): with self.test_session(use_gpu=False) as sess: @@ -347,7 +347,7 @@ def testTightBoundingBox(self): [0, 1, 3], [1, 1, 4], [1, 3, 2], [1, 3, 3]]) self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33]) - self.assertAllEqual(output.shape, [2, 4, 5]) + self.assertAllEqual(output.dense_shape, [2, 4, 5]) def testInvalidRank(self): with self.test_session(use_gpu=False): @@ -436,7 +436,7 @@ def testFillNumber(self): output.indices, [[0, 0], [1, 0], [1, 3], [1, 4], [2, 0], [3, 2], [3, 3], [4, 0]]) self.assertAllEqual(output.values, [0, 10, 13, 14, -1, 32, 33, -1]) - self.assertAllEqual(output.shape, [5, 6]) + self.assertAllEqual(output.dense_shape, [5, 6]) self.assertAllEqual(empty_row_indicator_out, np.array([0, 0, 1, 0, 1]).astype(np.bool)) @@ -454,7 +454,7 @@ def testFillString(self): [[0, 0], [1, 0], [1, 3], 
[1, 4], [2, 0], [3, 2], [3, 3], [4, 0]]) self.assertAllEqual(output.values, [b"a", b"b", b"c", b"d", b"", b"e", b"f", b""]) - self.assertAllEqual(output.shape, [5, 6]) + self.assertAllEqual(output.dense_shape, [5, 6]) self.assertAllEqual(empty_row_indicator_out, np.array([0, 0, 1, 0, 1]).astype(np.bool)) @@ -469,7 +469,7 @@ def testNoEmptyRows(self): self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4]]) self.assertAllEqual(output.values, [0, 10, 13, 14]) - self.assertAllEqual(output.shape, [2, 6]) + self.assertAllEqual(output.dense_shape, [2, 6]) self.assertAllEqual(empty_row_indicator_out, np.zeros(2).astype(np.bool)) @@ -480,7 +480,7 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase): # where ? is implictly-zero. ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64) vals = np.array([1, 1, 1]).astype(np.int32) - shape = np.array([2, 3]).astype(np.int64) + dense_shape = np.array([2, 3]).astype(np.int64) def _compare(self, sp_t, reduction_axes, ndims, keep_dims): densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval() @@ -517,7 +517,7 @@ def _compare_all(self, sp_t, reduction_axes, ndims): self._compare(sp_t, reduction_axes, ndims, True) def testSimpleAndRandomInputs(self): - sp_t = tf.SparseTensor(self.ind, self.vals, self.shape) + sp_t = tf.SparseTensor(self.ind, self.vals, self.dense_shape) with self.test_session(use_gpu=False): self._compare_all(sp_t, None, ndims=2) @@ -541,7 +541,7 @@ def testSimpleAndRandomInputs(self): self._compare_all(sp_t, axes, ndims=len(dims)) def testInvalidAxes(self): - sp_t = tf.SparseTensor(self.ind, self.vals, self.shape) + sp_t = tf.SparseTensor(self.ind, self.vals, self.dense_shape) with self.test_session(use_gpu=False): with self.assertRaisesOpError("Invalid reduction dimension -3"): sparse_ops.sparse_reduce_sum(sp_t, -3).eval() @@ -576,10 +576,11 @@ def _check(self, result_tensor, result_np, input_sp_t): self.assertTrue(isinstance(result_tensor, tf.SparseTensor)) self.assertTrue(isinstance(input_sp_t, tf.SparseTensor)) self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval()) - self.assertAllEqual(input_sp_t.shape.eval(), result_tensor.shape.eval()) + self.assertAllEqual( + input_sp_t.dense_shape.eval(), result_tensor.dense_shape.eval()) res_densified = sparse_ops.sparse_to_dense(result_tensor.indices, - result_tensor.shape, + result_tensor.dense_shape, result_tensor.values).eval() self.assertAllEqual(result_np, res_densified) @@ -700,7 +701,7 @@ def testHigherRanks(self): self.assertAllEqual(expected_values, result.values) self.assertAllEqual(sp_t.indices.eval(), result.indices) - self.assertAllEqual(shape, result.shape) + self.assertAllEqual(shape, result.dense_shape) def testGradient(self): x_shape = [2, 5, 10] @@ -719,7 +720,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase): def _assertSparseTensorValueEqual(self, a, b): self.assertAllEqual(a.indices, b.indices) self.assertAllEqual(a.values, b.values) - self.assertAllEqual(a.shape, b.shape) + self.assertAllEqual(a.dense_shape, b.dense_shape) def testBasic(self): with self.test_session(use_gpu=False): diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py index dd5f9a09410143..d665e8ed86836b 100644 --- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py @@ -95,7 +95,7 @@ def testGradients(self): for _ in range(5): # To test various random permutations input_val = 
self._SparseTensorValue_5x6(np.random.permutation(6)) sp_input = tf.SparseTensor( - input_val.indices, input_val.values, input_val.shape) + input_val.indices, input_val.values, input_val.dense_shape) sp_output = tf.sparse_reorder(sp_input) err = tf.test.compute_gradient_error( diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py index f6dee8a3fb2ea1..4b7e158d54468e 100644 --- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py @@ -200,7 +200,7 @@ def testFeedPartialShapes(self): sp_input = self._SparseTensorPlaceholder() sp_output = tf.sparse_reshape(sp_input, [2, 3, 5]) self.assertListEqual(sp_output.indices.get_shape().as_list(), [None, 3]) - self.assertListEqual(sp_output.shape.get_shape().as_list(), [3]) + self.assertListEqual(sp_output.dense_shape.get_shape().as_list(), [3]) # Incorporate known shape information about input indices in output # indices @@ -208,7 +208,7 @@ def testFeedPartialShapes(self): sp_input.indices.set_shape([5, None]) sp_output = tf.sparse_reshape(sp_input, [2, 3, 5]) self.assertListEqual(sp_output.indices.get_shape().as_list(), [5, 3]) - self.assertListEqual(sp_output.shape.get_shape().as_list(), [3]) + self.assertListEqual(sp_output.dense_shape.get_shape().as_list(), [3]) # Even if new_shape has no shape information, we know the ranks of # output indices and shape @@ -217,7 +217,7 @@ def testFeedPartialShapes(self): new_shape = tf.placeholder(tf.int64) sp_output = tf.sparse_reshape(sp_input, new_shape) self.assertListEqual(sp_output.indices.get_shape().as_list(), [5, None]) - self.assertListEqual(sp_output.shape.get_shape().as_list(), [None]) + self.assertListEqual(sp_output.dense_shape.get_shape().as_list(), [None]) def testFeedDenseReshapeSemantics(self): with self.test_session(use_gpu=False) as sess: diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py index 5896634a65ad35..13b7fcc7c078c4 100644 --- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py @@ -123,7 +123,7 @@ def testSerializeManyDeserializeManyRoundTrip(self): [serialized, deserialized], feed_dict={sparse_tensor.indices: indices_value, sparse_tensor.values: values_value, - sparse_tensor.shape: shape_value}) + sparse_tensor.dense_shape: shape_value}) self.assertEqual(serialized_value.shape, (4, 3)) self.assertAllEqual(deserialized_value.indices, indices_value) self.assertAllEqual(deserialized_value.values, values_value) diff --git a/tensorflow/python/kernel_tests/sparse_split_op_test.py b/tensorflow/python/kernel_tests/sparse_split_op_test.py index ed26ded934121d..dedaa1b1c47176 100644 --- a/tensorflow/python/kernel_tests/sparse_split_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_split_op_test.py @@ -73,167 +73,186 @@ def _SparseTensor_3x4x2(self): def testSplitMatrixRows(self): with self.test_session(use_gpu=False): - sp_tensors = tf.sparse_split(0, 2, self._SparseTensor_4x6()) + sp_tensors = tf.sparse_split( + sp_input=self._SparseTensor_4x6(), num_split=2, axis=0) self.assertAllEqual(len(sp_tensors), 2) self.assertAllEqual(sp_tensors[0].indices.eval(), [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4]]) self.assertAllEqual(sp_tensors[0].values.eval(), [0, 2, 4, 5, 11, 13, 14]) - self.assertAllEqual(sp_tensors[0].shape.eval(), [2, 6]) + 
self.assertAllEqual(sp_tensors[0].dense_shape.eval(), [2, 6]) self.assertAllEqual(sp_tensors[1].indices.eval(), [[0, 0], [0, 3], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5]]) self.assertAllEqual(sp_tensors[1].values.eval(), [20, 23, 25, 30, 32, 33, 35]) - self.assertAllEqual(sp_tensors[1].shape.eval(), [2, 6]) + self.assertAllEqual(sp_tensors[1].dense_shape.eval(), [2, 6]) def testSplitMatrixUnevenCols(self): with self.test_session(use_gpu=False): - sp_tensors_3 = tf.sparse_split(1, 3, self._SparseTensor_5x7()) + sp_tensors_3 = tf.sparse_split( + sp_input=self._SparseTensor_5x7(), num_split=3, axis=1) self.assertAllEqual(len(sp_tensors_3), 3) self.assertAllEqual(sp_tensors_3[0].indices.eval(), [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2], [4, 1]]) self.assertAllEqual(sp_tensors_3[0].values.eval(), [0, 2, 11, 20, 30, 32, 41]) - self.assertAllEqual(sp_tensors_3[0].shape.eval(), [5, 3]) + self.assertAllEqual(sp_tensors_3[0].dense_shape.eval(), [5, 3]) self.assertAllEqual(sp_tensors_3[1].indices.eval(), [[0, 1], [1, 0], [1, 1], [2, 0], [3, 0], [4, 1]]) self.assertAllEqual(sp_tensors_3[1].values.eval(), [4, 13, 14, 23, 33, 44]) - self.assertAllEqual(sp_tensors_3[1].shape.eval(), [5, 2]) + self.assertAllEqual(sp_tensors_3[1].dense_shape.eval(), [5, 2]) self.assertAllEqual(sp_tensors_3[2].indices.eval(), [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]]) self.assertAllEqual(sp_tensors_3[2].values.eval(), [5, 16, 25, 35, 46]) - self.assertAllEqual(sp_tensors_3[2].shape.eval(), [5, 2]) - sp_tensors_4 = tf.sparse_split(1, 4, self._SparseTensor_5x7()) + self.assertAllEqual(sp_tensors_3[2].dense_shape.eval(), [5, 2]) + sp_tensors_4 = tf.sparse_split( + sp_input=self._SparseTensor_5x7(), num_split=4, axis=1) self.assertAllEqual(len(sp_tensors_4), 4) self.assertAllEqual(sp_tensors_4[0].indices.eval(), [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]]) self.assertAllEqual(sp_tensors_4[0].values.eval(), [0, 11, 20, 30, 41]) - self.assertAllEqual(sp_tensors_4[0].shape.eval(), [5, 2]) + self.assertAllEqual(sp_tensors_4[0].dense_shape.eval(), [5, 2]) self.assertAllEqual(sp_tensors_4[1].indices.eval(), [[0, 0], [1, 1], [2, 1], [3, 0], [3, 1]]) self.assertAllEqual(sp_tensors_4[1].values.eval(), [2, 13, 23, 32, 33]) - self.assertAllEqual(sp_tensors_4[1].shape.eval(), [5, 2]) + self.assertAllEqual(sp_tensors_4[1].dense_shape.eval(), [5, 2]) self.assertAllEqual(sp_tensors_4[2].indices.eval(), [[0, 0], [0, 1], [1, 0], [2, 1], [3, 1], [4, 0]]) self.assertAllEqual(sp_tensors_4[2].values.eval(), [4, 5, 14, 25, 35, 44]) - self.assertAllEqual(sp_tensors_4[2].shape.eval(), [5, 2]) + self.assertAllEqual(sp_tensors_4[2].dense_shape.eval(), [5, 2]) self.assertAllEqual(sp_tensors_4[3].indices.eval(), [[1, 0], [4, 0]]) self.assertAllEqual(sp_tensors_4[3].values.eval(), [16, 46]) - self.assertAllEqual(sp_tensors_4[3].shape.eval(), [5, 1]) + self.assertAllEqual(sp_tensors_4[3].dense_shape.eval(), [5, 1]) def testSplitMatrixUnevenRows(self): with self.test_session(use_gpu=False): - sp_tensors_2 = tf.sparse_split(0, 2, self._SparseTensor_5x7()) + sp_tensors_2 = tf.sparse_split( + sp_input=self._SparseTensor_5x7(), num_split=2, axis=0) self.assertAllEqual(sp_tensors_2[0].indices.eval(), [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4], [1, 6], [2, 0], [2, 3], [2, 5]]) self.assertAllEqual(sp_tensors_2[0].values.eval(), [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25]) - self.assertAllEqual(sp_tensors_2[0].shape.eval(), [3, 7]) + self.assertAllEqual(sp_tensors_2[0].dense_shape.eval(), [3, 7]) self.assertAllEqual(sp_tensors_2[1].indices.eval(), [[0, 0], 
[0, 2], [0, 3], [0, 5], [1, 1], [1, 4], [1, 6]]) self.assertAllEqual(sp_tensors_2[1].values.eval(), [30, 32, 33, 35, 41, 44, 46]) - self.assertAllEqual(sp_tensors_2[1].shape.eval(), [2, 7]) + self.assertAllEqual(sp_tensors_2[1].dense_shape.eval(), [2, 7]) self.assertAllEqual(len(sp_tensors_2), 2) - sp_tensors_3 = tf.sparse_split(0, 3, self._SparseTensor_5x7()) + sp_tensors_3 = tf.sparse_split( + sp_input=self._SparseTensor_5x7(), num_split=3, axis=0) self.assertAllEqual(len(sp_tensors_3), 3) self.assertAllEqual(sp_tensors_3[0].indices.eval(), [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4], [1, 6]]) self.assertAllEqual(sp_tensors_3[0].values.eval(), [0, 2, 4, 5, 11, 13, 14, 16]) - self.assertAllEqual(sp_tensors_3[0].shape.eval(), [2, 7]) + self.assertAllEqual(sp_tensors_3[0].dense_shape.eval(), [2, 7]) self.assertAllEqual(sp_tensors_3[1].values.eval(), [20, 23, 25, 30, 32, 33, 35]) - self.assertAllEqual(sp_tensors_3[1].shape.eval(), [2, 7]) + self.assertAllEqual(sp_tensors_3[1].dense_shape.eval(), [2, 7]) self.assertAllEqual(sp_tensors_3[2].indices.eval(), [[0, 1], [0, 4], [0, 6]]) self.assertAllEqual(sp_tensors_3[2].values.eval(), [41, 44, 46]) - self.assertAllEqual(sp_tensors_3[2].shape.eval(), [1, 7]) + self.assertAllEqual(sp_tensors_3[2].dense_shape.eval(), [1, 7]) return def testSplitAllRows(self): with self.test_session(use_gpu=False): - sp_tensors = tf.sparse_split(0, 4, self._SparseTensor_4x6()) + sp_tensors = tf.sparse_split( + sp_input=self._SparseTensor_4x6(), num_split=4, axis=0) self.assertAllEqual(len(sp_tensors), 4) self.assertAllEqual(sp_tensors[0].indices.eval(), [[0, 0], [0, 2], [0, 4], [0, 5]]) self.assertAllEqual(sp_tensors[0].values.eval(), [0, 2, 4, 5]) - self.assertAllEqual(sp_tensors[0].shape.eval(), [1, 6]) + self.assertAllEqual(sp_tensors[0].dense_shape.eval(), [1, 6]) self.assertAllEqual(sp_tensors[1].indices.eval(), [[0, 1], [0, 3], [0, 4]]) self.assertAllEqual(sp_tensors[1].values.eval(), [11, 13, 14]) - self.assertAllEqual(sp_tensors[1].shape.eval(), [1, 6]) + self.assertAllEqual(sp_tensors[1].dense_shape.eval(), [1, 6]) self.assertAllEqual(sp_tensors[2].indices.eval(), [[0, 0], [0, 3], [0, 5]]) self.assertAllEqual(sp_tensors[2].values.eval(), [20, 23, 25]) - self.assertAllEqual(sp_tensors[2].shape.eval(), [1, 6]) + self.assertAllEqual(sp_tensors[2].dense_shape.eval(), [1, 6]) self.assertAllEqual(sp_tensors[3].indices.eval(), [[0, 0], [0, 2], [0, 3], [0, 5]]) self.assertAllEqual(sp_tensors[3].values.eval(), [30, 32, 33, 35]) - self.assertAllEqual(sp_tensors[3].shape.eval(), [1, 6]) + self.assertAllEqual(sp_tensors[3].dense_shape.eval(), [1, 6]) def testSplitColumns(self): with self.test_session(use_gpu=False): - sparse_tensors = tf.sparse_split(1, 3, self._SparseTensor_4x6()) + sparse_tensors = tf.sparse_split( + sp_input=self._SparseTensor_4x6(), num_split=3, axis=1) self.assertAllEqual(len(sparse_tensors), 3) self.assertAllEqual(sparse_tensors[0].indices.eval(), [[0, 0], [1, 1], [2, 0], [3, 0]]) self.assertAllEqual(sparse_tensors[0].values.eval(), [0, 11, 20, 30]) - self.assertAllEqual(sparse_tensors[0].shape.eval(), [4, 2]) + self.assertAllEqual(sparse_tensors[0].dense_shape.eval(), [4, 2]) self.assertAllEqual(sparse_tensors[1].indices.eval(), [[0, 0], [1, 1], [2, 1], [3, 0], [3, 1]]) self.assertAllEqual(sparse_tensors[1].values.eval(), [2, 13, 23, 32, 33]) - self.assertAllEqual(sparse_tensors[1].shape.eval(), [4, 2]) + self.assertAllEqual(sparse_tensors[1].dense_shape.eval(), [4, 2]) self.assertAllEqual(sparse_tensors[2].indices.eval(), [[0, 0], 
[0, 1], [1, 0], [2, 1], [3, 1]]) self.assertAllEqual(sparse_tensors[2].values.eval(), [4, 5, 14, 25, 35]) - self.assertAllEqual(sparse_tensors[2].shape.eval(), [4, 2]) + self.assertAllEqual(sparse_tensors[2].dense_shape.eval(), [4, 2]) def testSplitAllColumns(self): with self.test_session(use_gpu=False): - sparse_tensors = tf.sparse_split(1, 6, self._SparseTensor_4x6()) + sparse_tensors = tf.sparse_split( + sp_input=self._SparseTensor_4x6(), num_split=6, axis=1) self.assertAllEqual(len(sparse_tensors), 6) self.assertAllEqual(sparse_tensors[0].indices.eval(), [[0, 0], [2, 0], [3, 0]]) self.assertAllEqual(sparse_tensors[0].values.eval(), [0, 20, 30]) - self.assertAllEqual(sparse_tensors[0].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[0].dense_shape.eval(), [4, 1]) self.assertAllEqual(sparse_tensors[1].indices.eval(), [[1, 0]]) self.assertAllEqual(sparse_tensors[1].values.eval(), [11]) - self.assertAllEqual(sparse_tensors[1].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[1].dense_shape.eval(), [4, 1]) self.assertAllEqual(sparse_tensors[2].indices.eval(), [[0, 0], [3, 0]]) self.assertAllEqual(sparse_tensors[2].values.eval(), [2, 32]) - self.assertAllEqual(sparse_tensors[2].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[2].dense_shape.eval(), [4, 1]) self.assertAllEqual(sparse_tensors[3].indices.eval(), [[1, 0], [2, 0], [3, 0]]) - self.assertAllEqual(sparse_tensors[3].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[3].dense_shape.eval(), [4, 1]) self.assertAllEqual(sparse_tensors[3].values.eval(), [13, 23, 33]) self.assertAllEqual(sparse_tensors[4].indices.eval(), [[0, 0], [1, 0]]) self.assertAllEqual(sparse_tensors[4].values.eval(), [4, 14]) - self.assertAllEqual(sparse_tensors[4].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[4].dense_shape.eval(), [4, 1]) self.assertAllEqual(sparse_tensors[5].indices.eval(), [[0, 0], [2, 0], [3, 0]]) self.assertAllEqual(sparse_tensors[5].values.eval(), [5, 25, 35]) - self.assertAllEqual(sparse_tensors[5].shape.eval(), [4, 1]) + self.assertAllEqual(sparse_tensors[5].dense_shape.eval(), [4, 1]) def testSliceConcat(self): for sp_input in ( self._SparseTensorValue_3x4x2(), self._SparseTensor_3x4x2()): with self.test_session(use_gpu=False): - sparse_tensors = tf.sparse_split(1, 2, sp_input) + sparse_tensors = tf.sparse_split( + sp_input=sp_input, num_split=2, axis=1) concat_tensor = tf.sparse_concat(1, sparse_tensors) expected_output = self._SparseTensor_3x4x2() self.assertAllEqual(concat_tensor.indices.eval(), expected_output.indices.eval()) + def testArgumentErrors(self): + with self.assertRaisesRegexp(ValueError, 'Keyword arguments are required'): + tf.sparse_split(3, 2, 1) + with self.assertRaisesRegexp(ValueError, 'sp_input is required'): + tf.sparse_split() + with self.assertRaisesRegexp(ValueError, 'num_split is required'): + tf.sparse_split(sp_input=1) + with self.assertRaisesRegexp(ValueError, 'axis is required'): + tf.sparse_split(num_split=2, sp_input=1) + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py index f2539624793c68..b443358fda714b 100644 --- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py +++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py @@ -33,7 +33,7 @@ def _sparsify(self, x): x_shape = x.shape return tf.SparseTensor( - indices=x_indices, values=x_values, shape=x_shape), 
len(x_values) + indices=x_indices, values=x_values, dense_shape=x_shape), len(x_values) def _randomTensor(self, size, np_dtype, adjoint=False, sparse=False): n, m = size diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py index 3ca79275eaf7df..5685668f683e2e 100644 --- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py @@ -52,7 +52,7 @@ def _testMatmul(self, x, y, adjoint_a=False, adjoint_b=False): with self.test_session(use_gpu=True): sp_x_value = tf.SparseTensorValue( - indices=x_indices, values=x_values, shape=x_shape) + indices=x_indices, values=x_values, dense_shape=x_shape) tf_value_ans = sparse_ops.sparse_tensor_dense_matmul( sp_x_value, y, adjoint_a=adjoint_a, adjoint_b=adjoint_b) tf_tensor_ans = sparse_ops.sparse_tensor_dense_matmul( @@ -169,7 +169,7 @@ def _timeit(iterations, _): def _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse( x_ind, x_val, x_shape, y, adjoint_a, adjoint_b): - sp_x = tf.SparseTensor(indices=x_ind, values=x_val, shape=x_shape) + sp_x = tf.SparseTensor(indices=x_ind, values=x_val, dense_shape=x_shape) def body(t, prev): with tf.control_dependencies([prev]): diff --git a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py index 48c9f551edc0a4..240644d228f82d 100644 --- a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py @@ -138,7 +138,7 @@ def testAddManyTakeManyRoundTrip(self): [handles, roundtrip], feed_dict={sparse_tensor.indices: indices_value, sparse_tensor.values: values_value, - sparse_tensor.shape: shape_value}) + sparse_tensor.dense_shape: shape_value}) self.assertEqual(handles_value.shape, (4,)) self.assertAllEqual(roundtrip_value.indices, indices_value) self.assertAllEqual(roundtrip_value.values, values_value) @@ -220,7 +220,7 @@ def benchmarkVeryLarge2DFloatSparseTensor(self): np.testing.assert_equal( st_roundtrip_values.indices, st_deserialized_values.indices) np.testing.assert_equal( - st_roundtrip_values.shape, st_deserialized_values.shape) + st_roundtrip_values.dense_shape, st_deserialized_values.dense_shape) self.run_op_benchmark( sess, st_roundtrip_op, min_iters=2000, diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py index ba079b5149e0b6..5bfaacbb572cc2 100644 --- a/tensorflow/python/kernel_tests/split_op_test.py +++ b/tensorflow/python/kernel_tests/split_op_test.py @@ -24,6 +24,18 @@ class SplitVOpTest(tf.test.TestCase): + def testListOfScalarTensors(self): + a = tf.to_int32(5) + b = tf.to_int32(6) + + value = np.random.rand(11, 11) + + with self.test_session(use_gpu=False) as sess: + result = sess.run(tf.split_v(value, [a, b])) + + self.assertAllEqual(result[0], value[0:5, :]) + self.assertAllEqual(result[1], value[5:, :]) + def _RunAndVerify(self, use_gpu, large_num_splits=False): # Random dims of rank 5 shape = np.random.randint(1, 5, size=5) diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py index 06148eefa44b33..1870dd17a96c33 100644 --- a/tensorflow/python/kernel_tests/summary_ops_test.py +++ b/tensorflow/python/kernel_tests/summary_ops_test.py @@ -89,12 +89,12 @@ def testMergeAllSummaries(self): self.assertEqual(2, len(merge.op.inputs)) self.assertEqual(summ1, 
merge.op.inputs[0]) self.assertEqual(summ3, merge.op.inputs[1]) - merge = tf.contrib.deprecated.merge_all_summaries("foo_key") + merge = tf.summary.merge_all("foo_key") self.assertEqual("MergeSummary", merge.op.type) self.assertEqual(1, len(merge.op.inputs)) self.assertEqual(summ2, merge.op.inputs[0]) self.assertTrue( - tf.contrib.deprecated.merge_all_summaries("bar_key") is None) + tf.summary.merge_all("bar_key") is None) def testHistogramSummaryTypes(self): with tf.Graph().as_default(): diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 16f2585fecb6c8..b7a0b1e5b8af78 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -63,7 +63,7 @@ def _testTensorArrayWritePack(self, tf_dtype): w1 = w0.write(1, convert([[6.0, 7.0]])) w2 = w1.write(2, convert([[8.0, 9.0]])) - c0 = w2.pack() + c0 = w2.stack() self.assertAllEqual( convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval()) @@ -123,7 +123,7 @@ def _testTensorArrayPackNotAllValuesAvailableFails(self): with self.assertRaisesOpError( "Could not read from TensorArray index 1 " "because it has not yet been written to."): - ta.write(0, [[4.0, 5.0]]).pack().eval() + ta.write(0, [[4.0, 5.0]]).stack().eval() def testTensorArrayPackNotAllValuesAvailableFails(self): self._testTensorArrayPackNotAllValuesAvailableFails() @@ -141,7 +141,7 @@ def _testTensorArrayUnpackRead(self, tf_dtype): convert = lambda x: np.asarray(x).astype(dtype) # Unpack a vector into scalars - w0 = ta.unpack(convert([1.0, 2.0, 3.0])) + w0 = ta.unstack(convert([1.0, 2.0, 3.0])) r0 = w0.read(0) r1 = w0.read(1) r2 = w0.read(2) @@ -155,7 +155,7 @@ def _testTensorArrayUnpackRead(self, tf_dtype): dtype=tf_dtype, tensor_array_name="foo", size=3) # Unpack a matrix into vectors - w1 = ta.unpack(convert([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]])) + w1 = ta.unstack(convert([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]])) r0 = w1.read(0) r1 = w1.read(1) r2 = w1.read(2) @@ -171,7 +171,7 @@ def _testTensorArrayUnpackRead(self, tf_dtype): dtype=tf_dtype, tensor_array_name="foo", size=3) # Try unpacking an empty matrix, which should not cause an error. 
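Note: the test renames above boil down to a few new spellings. A minimal usage sketch of the new API names (not part of the patch; the indices, values, and shapes below are made up for illustration):

import tensorflow as tf

# SparseTensor: the constructor argument and attribute `shape` are now `dense_shape`.
sp = tf.SparseTensor(indices=[[0, 0], [1, 2]],
                     values=[1.0, 2.0],
                     dense_shape=[3, 4])        # formerly shape=[3, 4]

# tf.sparse_split: arguments must now be passed by keyword.
splits = tf.sparse_split(sp_input=sp, num_split=2, axis=1)

# TensorArray: unpack/pack are renamed to unstack/stack.
ta = tf.TensorArray(dtype=tf.float32, size=3)
ta = ta.unstack(tf.constant([1.0, 2.0, 3.0]))   # formerly ta.unpack(...)
stacked = ta.stack()                            # formerly ta.pack()

with tf.Session() as sess:
    print(sess.run(sp.dense_shape))             # [3 4], formerly sp.shape
    print(sess.run(stacked))                    # [1. 2. 3.]
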
- w2 = ta.unpack(convert([[], [], []])) + w2 = ta.unstack(convert([[], [], []])) r0 = w2.read(0) r1 = w2.read(1) r2 = w2.read(2) @@ -583,7 +583,7 @@ def _testTensorArrayGradientWritePackConcatAndRead(self): w0 = ta.write(0, value_0) w1 = w0.write(1, value_1) - p0 = w1.pack() + p0 = w1.stack() r0 = w1.read(0) s0 = w1.concat() @@ -610,7 +610,7 @@ def testTensorArrayReadTwice(self): ta_readonce = tf.TensorArray( dtype=tf.float32, tensor_array_name="foo", size=2) - w_readonce = ta_readonce.unpack(value) + w_readonce = ta_readonce.unstack(value) r0_readonce = w_readonce.read(0) with tf.control_dependencies([r0_readonce]): r1_readonce = w_readonce.read(0) @@ -623,7 +623,7 @@ def testTensorArrayReadTwice(self): ta_readtwice = tf.TensorArray( dtype=tf.float32, tensor_array_name="foo", size=2, clear_after_read=False) - w_readtwice = ta_readtwice.unpack(value) + w_readtwice = ta_readtwice.unstack(value) r0_readtwice = w_readtwice.read(0) with tf.control_dependencies([r0_readtwice]): r1_readtwice = w_readtwice.read(0) @@ -638,7 +638,7 @@ def _testTensorArrayGradientUnpackRead(self): value = tf.constant([[1.0, -1.0], [10.0, -10.0]]) - w = ta.unpack(value) + w = ta.unstack(value) r0 = w.read(0) r0_1 = w.read(0) r1 = w.read(1) @@ -682,7 +682,7 @@ def _testTensorArrayGradientDynamicUnpackRead(self): value = tf.constant([[1.0, -1.0], [10.0, -10.0]]) - w = ta.unpack(value) + w = ta.unstack(value) r0 = w.read(0) r1 = w.read(1) @@ -746,7 +746,7 @@ def body(time, ta_t, state): tensor_shape.unknown_shape(), tensor_shape.unknown_shape()), parallel_iterations=3) - vout = h_final.pack() + vout = h_final.stack() grad_val = -np.arange(3*5, dtype=np_dtype).reshape(3, 5) v0_grad = tf.gradients([vout], [v0], [grad_val])[0] @@ -807,7 +807,7 @@ def testGradSerialTwoLoops(self): num_steps = 100 acc = tf.TensorArray(dtype=tf.float32, size=num_steps, clear_after_read=False, - elem_shape=tensor_shape.scalar()) + element_shape=tensor_shape.scalar()) i = tf.constant(0, name="i") x = tf.constant(2.0, name="x") @@ -824,7 +824,7 @@ def fn(i, acc): return i + 1, acc.write(i, z) _, acc2 = tf.while_loop(lambda i, acc: i < num_steps, fn, [i1, acc1]) - r = acc2.pack() + r = acc2.stack() grad = tf.gradients(r, [x])[0] self.assertAllClose(31.0, grad.eval()) @@ -926,7 +926,7 @@ def _testUnpackShape(self): dtype=tf.float32, tensor_array_name="foo", size=0, dynamic_size=True, infer_shape=True) value = tf.constant([[1.0, -1.0], [10.0, -10.0], [100.0, -100.0]]) - w0 = ta.unpack(value) + w0 = ta.unstack(value) r0 = w0.read(0) self.assertAllEqual((2,), r0.get_shape()) @@ -972,7 +972,7 @@ def _testGradientWhenNotAllComponentsRead(self): with self.test_session(use_gpu=True) as session: ta = tf.TensorArray(dtype=tf.float32, size=2) x = tf.constant([2.0, 3.0]) - w = ta.unpack(x) + w = ta.unstack(x) r0 = w.read(0) # calculate (dr0/dx0, dr0/dx1). since r0 = x0, gradients are (1, 0). grad_r0 = tf.gradients(ys=[r0], xs=[x], grad_ys=[1.0]) @@ -987,9 +987,9 @@ def _testTensorArrayUnpackDynamic(self): ta = tf.TensorArray(dtype=tf.float32, size=3, dynamic_size=True) x = tf.constant([1.0, 2.0, 3.0]) - w0 = ta.unpack(x) + w0 = ta.unstack(x) w1 = w0.write(3, 4.0) - r = w1.pack() + r = w1.stack() self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval()) grad = tf.gradients(ys=[r], xs=[x]) self.assertAllEqual(np.array([1.0, 1.0, 1.0]), @@ -1021,7 +1021,7 @@ def _testTensorArrayEvalEmpty(self): "TensorArray has size zero, but element shape is not fully " "defined. 
Currently only static shapes are supported when packing " "zero-size TensorArrays."): - ta.pack().eval() + ta.stack().eval() def testTensorArrayEvalEmpty(self): self._testTensorArrayEvalEmpty() @@ -1034,8 +1034,8 @@ def _testTensorArrayEvalEmptyWithDefault(self): infer_shape=True) self.assertEqual(0, ta.size().eval()) # Don't actually perform the pack. This stores the static shape. - ta.unpack(tf.zeros([0, 3, 5])) - packed = ta.pack() + ta.unstack(tf.zeros([0, 3, 5])) + packed = ta.stack() self.assertAllEqual([0, 3, 5], packed.eval().shape) # Concatenating zero tensors along their first dimension gives a # first dimension of zero @@ -1075,7 +1075,7 @@ def testTensorArrayWriteGatherAndGradients(self): values = tf.constant([[1.0*x, -1.0*x] for x in range(10)]) indices = tf.constant([1, 8]) - w = ta.unpack(values) + w = ta.unstack(values) g = w.gather(indices) # Test combined gradients + aggregation of read(0) @@ -1119,11 +1119,11 @@ def testTensorArrayGetsDeviceFromFirstWrite(self): self.assertEqual(ta.handle.device, "") self.assertEqual(ta.flow.device, "") with tf.device("/gpu:0"): - ta = ta.unpack([1.0, 2.0]) + ta = ta.unstack([1.0, 2.0]) self.assertTrue("gpu:0" in ta.handle.device.lower()) self.assertTrue("gpu:0" in ta.flow.device.lower()) with tf.device("/gpu:1"): - ta = ta.unpack([1.0, 2.0]) + ta = ta.unstack([1.0, 2.0]) self.assertTrue("gpu:0" in ta.handle.device.lower()) self.assertTrue("gpu:0" in ta.flow.device.lower()) diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py index 6e73d329cdf93a..0c524a7f80c8db 100644 --- a/tensorflow/python/kernel_tests/variable_scope_test.py +++ b/tensorflow/python/kernel_tests/variable_scope_test.py @@ -641,7 +641,7 @@ def testGetVarWithDevice(self): varname_type = [] def device_func(op): - if op.type == "Variable": + if op.type in ["Variable", "VariableV2"]: varname_type.append((op.name, op.get_attr("dtype"))) return "/gpu:0" @@ -679,9 +679,9 @@ def testResultNameMatchesRequested(self): v_concat = v.as_tensor() self.assertEqual(v_concat.name, "scope0/name0:0") variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - self.assertTrue("scope0/name0/part_0:0" in [x.name for x in variables]) - self.assertTrue("scope0/name0/part_1:0" in [x.name for x in variables]) - self.assertFalse("scope0/name0/part_2:0" in [x.name for x in variables]) + self.assertIn("scope0/name0/part_0:0", [x.name for x in variables]) + self.assertIn("scope0/name0/part_1:0", [x.name for x in variables]) + self.assertNotIn("scope0/name0/part_2:0", [x.name for x in variables]) def testBreaksIfPartitioningChanges(self): with tf.variable_scope("scope0", partitioner=axis0_into2_partitioner): @@ -725,6 +725,13 @@ def testPropagatePartitionerOnReopening(self): with tf.variable_scope(vs) as vs1: self.assertEqual(axis0_into2_partitioner, vs1.partitioner) + def testScalarIgnoresPartitioner(self): + with tf.variable_scope("scope0", partitioner=axis0_into2_partitioner): + v = tf.get_variable("name0", shape=()) + self.assertEqual(v.name, "scope0/name0:0") + variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + self.assertIn("scope0/name0:0", [x.name for x in variables]) + def testPartitionConcatenatesAlongCorrectAxis(self): def _part_axis_0(**unused_kwargs): return (2, 1, 1) diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 8d875477f6b03e..17e999b736f051 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -31,8 +31,9 @@ import six from 
tensorflow.python.framework import ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.framework import dtypes +from tensorflow.python.ops import variables as tf_variables +from tensorflow.python.ops import variable_scope as vs class _Layer(object): @@ -45,15 +46,16 @@ class _Layer(object): infrastructure functionality. A layer is a class implementing common neural networks operations, such - as convolution, batch norm, etc. These operations require managing weights, + as convolution, batch norm, etc. These operations require managing variables, losses, and updates, as well as applying TensorFlow ops to input tensors. Properties: trainable: Whether the layer should be trained (boolean). name: The name of the layer (string). - trainable_weights: List of trainable weights. - non_trainable_weights: List of non-trainable weights. - weights: List of all weights of this layer, trainable and non-trainable. + dtype: Default dtype of the layer (dtypes.float32). + trainable_variables: List of trainable variables. + non_trainable_variables: List of non-trainable variables. + variables: List of all variables of this layer, trainable and non-trainable. updates: List of update ops of this layer. losses: List of losses added by this layer. """ @@ -70,7 +72,7 @@ def __init__(self, trainable=True, name=None, # in calls to kwargs.get(). allowed_kwargs = { '_scope', - '_reuse_weights', + '_reuse', } for kwarg in kwargs: if kwarg not in allowed_kwargs: @@ -78,11 +80,11 @@ def __init__(self, trainable=True, name=None, self._trainable = trainable self._built = False - self._trainable_weights = [] - self._non_trainable_weights = [] + self._trainable_variables = [] + self._non_trainable_variables = [] self._updates = [] self._losses = [] - self._reuse_weights = kwargs.get('_reuse_weights') + self._reuse = kwargs.get('_reuse') self._dtype = dtype # Determine base name (non-unique). @@ -114,12 +116,21 @@ def name(self): return self._name @property - def trainable_weights(self): - return self._trainable_weights if self.trainable else [] + def trainable_variables(self): + return self._trainable_variables if self.trainable else [] @property - def non_trainable_weights(self): - return self._non_trainable_weights if self.trainable else self.weights + def non_trainable_variables(self): + return self._non_trainable_variables if self.trainable else self.variables + + @property + def variables(self): + """Returns the list of all layer variables/weights. + + Returns: + A list of variables. + """ + return self._trainable_variables + self._non_trainable_variables @property def updates(self): @@ -137,17 +148,21 @@ def built(self): def trainable(self): return self._trainable + @property + def dtype(self): + return self._dtype + @property def weights(self): - """Returns the list of all layer weights, trainable and non-trainable. + """Returns the list of all layer variables/weights. Returns: A list of variables. """ - return self._trainable_weights + self._non_trainable_weights + return self.variables def build(self, _): - """Creates the weights of the layer. + """Creates the variables of the layer. """ self._built = True @@ -163,20 +178,20 @@ def call(self, inputs, **kwargs): """ raise NotImplementedError - def _add_weight(self, name, shape, dtype=None, - initializer=None, regularizer=None, trainable=True, - variable_getter=vs.get_variable): - """Adds a new weight variable to the layer. 
+ def _add_variable(self, name, shape, dtype=None, + initializer=None, regularizer=None, trainable=True, + variable_getter=vs.get_variable): + """Adds a new variable to the layer. Arguments: - name: weight name. - shape: weight shape. - dtype: The type of the weight variable. Defaults to `self._dtype`. + name: variable name. + shape: variable shape. + dtype: The type of the variable. Defaults to `self._dtype`. initializer: initializer instance (callable). regularizer: regularizer instance (callable). - trainable: whether the weight should be part of the layer's - "trainable_weights" (e.g. weights, biases) - or "non_trainable_weights" (e.g. BatchNorm mean, stddev). + trainable: whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) + or "non_trainable_variables" (e.g. BatchNorm mean, stddev). variable_getter: The getter to use for TensorFlow variables. Returns: @@ -189,18 +204,29 @@ def _add_weight(self, name, shape, dtype=None, initializer=initializer, dtype=dtype, trainable=trainable and self.trainable) + # TODO(sguada) fix name = variable.op.name if trainable: - self._trainable_weights.append(variable) + self._trainable_variables.append(variable) else: - self._non_trainable_weights.append(variable) - if regularizer and not self._reuse_weights: - with ops.colocate_with(variable.op): - with ops.name_scope(name + '/Regularizer'): - regularization = regularizer(variable) - if regularization is not None: - self._losses.append(regularization) - _add_elements_to_collection( - regularization, ops.GraphKeys.REGULARIZATION_LOSSES) + self._non_trainable_variables.append(variable) + if regularizer and not self._reuse: + if isinstance(variable, tf_variables.PartitionedVariable): + for v in variable: + with ops.colocate_with(v.op): + with ops.name_scope(name + '/Regularizer'): + regularization = regularizer(v) + if regularization is not None: + self._losses.append(regularization) + _add_elements_to_collection( + regularization, ops.GraphKeys.REGULARIZATION_LOSSES) + else: + with ops.colocate_with(variable.op): + with ops.name_scope(name + '/Regularizer'): + regularization = regularizer(variable) + if regularization is not None: + self._losses.append(regularization) + _add_elements_to_collection( + regularization, ops.GraphKeys.REGULARIZATION_LOSSES) return variable def __call__(self, inputs, **kwargs): @@ -214,19 +240,19 @@ def __call__(self, inputs, **kwargs): Output tensor(s). """ # Define a custom getter to override tf.get_variable when creating layer - # weights. We respect current custom getter, if one is set. + # variables. We respect current custom getter, if one is set. current_custom_getter = vs.get_variable_scope().custom_getter def variable_getter(getter, name, shape, dtype=None, initializer=None, regularizer=None, trainable=True, **kwargs): if current_custom_getter is not None: getter = functools.partial(current_custom_getter, getter) - return self._add_weight( + return self._add_variable( name, shape, initializer=initializer, regularizer=regularizer, dtype=dtype, trainable=trainable, variable_getter=functools.partial(getter, **kwargs)) # Build (if necessary) and call the layer, inside a variable scope. 
- with vs.variable_scope(self._scope, reuse=self._reuse_weights, + with vs.variable_scope(self._scope, reuse=self._reuse, custom_getter=variable_getter) as scope: with ops.name_scope(scope.original_name_scope): if not self.built: @@ -271,8 +297,8 @@ def apply(self, inputs, **kwargs): def _to_snake_case(name): - intermediate = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - insecure = re.sub('([a-z0-9])([A-Z])', r'\1_\2', intermediate).lower() + intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) + insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() # If the class is private the name starts with "_" which is not secure # for creating scopes. We prefix the name with "private" in this case. if insecure[0] != '_': diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py index 9262db2fc75f2c..ac8103cf2be09a 100644 --- a/tensorflow/python/layers/base_test.py +++ b/tensorflow/python/layers/base_test.py @@ -28,9 +28,9 @@ class BaseLayerTest(tf.test.TestCase): def testLayerProperties(self): layer = base_layers._Layer(name='my_layer') self.assertEqual(layer.name, 'my_layer') - self.assertListEqual(layer.weights, []) - self.assertListEqual(layer.trainable_weights, []) - self.assertListEqual(layer.non_trainable_weights, []) + self.assertListEqual(layer.variables, []) + self.assertListEqual(layer.trainable_variables, []) + self.assertListEqual(layer.non_trainable_variables, []) self.assertListEqual(layer.updates, []) self.assertListEqual(layer.losses, []) self.assertEqual(layer.built, False) @@ -42,31 +42,31 @@ def testAddWeight(self): layer = base_layers._Layer(name='my_layer') # Test basic variable creation. - variable = layer._add_weight('my_var', [2, 2], - initializer=tf.zeros_initializer) + variable = layer._add_variable('my_var', [2, 2], + initializer=tf.zeros_initializer) self.assertEqual(variable.name, 'my_var:0') - self.assertListEqual(layer.weights, [variable]) - self.assertListEqual(layer.trainable_weights, [variable]) - self.assertListEqual(layer.non_trainable_weights, []) - self.assertListEqual(layer.weights, + self.assertListEqual(layer.variables, [variable]) + self.assertListEqual(layer.trainable_variables, [variable]) + self.assertListEqual(layer.non_trainable_variables, []) + self.assertListEqual(layer.variables, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)) # Test non-trainable variable creation. - # layer._add_weight should work even outside `build` and `call`. - variable_2 = layer._add_weight('non_trainable_var', [2, 2], - initializer=tf.zeros_initializer, - trainable=False) - self.assertListEqual(layer.weights, [variable, variable_2]) - self.assertListEqual(layer.trainable_weights, [variable]) - self.assertListEqual(layer.non_trainable_weights, [variable_2]) + # layer._add_variable should work even outside `build` and `call`. + variable_2 = layer._add_variable('non_trainable_var', [2, 2], + initializer=tf.zeros_initializer, + trainable=False) + self.assertListEqual(layer.variables, [variable, variable_2]) + self.assertListEqual(layer.trainable_variables, [variable]) + self.assertListEqual(layer.non_trainable_variables, [variable_2]) self.assertEqual( len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 1) # Test with regularizer. 
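Note: the weights-to-variables rename in the layer base class amounts to new property and method names, as exercised by the tests here. A rough sketch of the renamed surface (`_Layer` is private API; the layer and variable names are made up):

import tensorflow as tf
from tensorflow.python.layers import base as base_layers

layer = base_layers._Layer(name='example_layer')

# _add_weight is now _add_variable; it still registers the variable with the layer.
v = layer._add_variable('w', [2, 2], initializer=tf.zeros_initializer)

assert layer.variables == [v]               # formerly layer.weights
assert layer.trainable_variables == [v]     # formerly layer.trainable_weights
assert layer.non_trainable_variables == []  # formerly layer.non_trainable_weights
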
regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - variable = layer._add_weight('reg_var', [2, 2], - initializer=tf.zeros_initializer, - regularizer=regularizer) + variable = layer._add_variable('reg_var', [2, 2], + initializer=tf.zeros_initializer, + regularizer=regularizer) self.assertEqual(len(layer.losses), 1) def testGetVariable(self): @@ -77,8 +77,8 @@ def testGetVariable(self): class MyLayer(base_layers._Layer): def build(self, input_shape): - self.w = tf.get_variable('my_var', [2, 2], - initializer=tf.zeros_initializer) + self.my_var = tf.get_variable('my_var', [2, 2], + initializer=tf.zeros_initializer) def call(self, inputs): return inputs @@ -86,7 +86,7 @@ def call(self, inputs): layer = MyLayer(name='my_layer') inputs = tf.random_uniform((5,), seed=1) _ = layer.apply(inputs) - self.assertListEqual(layer.weights, [layer.w]) + self.assertListEqual(layer.variables, [layer.my_var]) def testCall(self): diff --git a/tensorflow/python/layers/conv_utils.py b/tensorflow/python/layers/conv_utils.py new file mode 100644 index 00000000000000..ad6e7d3f327d03 --- /dev/null +++ b/tensorflow/python/layers/conv_utils.py @@ -0,0 +1,105 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order +"""Contains layer utilies for input validation and format conversion. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import numpy as np + + +def convert_data_format(data_format, ndim): + if data_format == 'channels_last': + if ndim == 3: + return 'NWC' + elif ndim == 4: + return 'NHWC' + elif ndim == 5: + return 'NDHWC' + else: + raise ValueError('Input rank not supported:', ndim) + elif data_format == 'channels_first': + if ndim == 3: + return 'NCW' + elif ndim == 4: + return 'NCHW' + elif ndim == 5: + raise ValueError('Data format "channels_first" not supported for ' + 'inputs with rank 5.') + else: + raise ValueError('Input rank not supported:', ndim) + else: + raise ValueError('Invalid data_format:', data_format) + + +def normalize_tuple(value, n, name): + """Transforms a single integer or iterable of integers into an integer tuple. + + Arguments: + value: The value to validate and convert. Could an int, or any iterable + of ints. + n: The size of the tuple to be returned. + name: The name of the argument being validated, e.g. "strides" or + "kernel_size". This is only used to format error messages. + + Returns: + A tuple of n integers. + + Raises: + ValueError: If something else than an int/long or iterable thereof was + passed. + """ + if isinstance(value, int): + return (value,) * n + else: + try: + value_tuple = tuple(value) + except TypeError: + raise ValueError('The `' + name + '` argument must be a tuple of ' + + str(n) + ' integers. 
Received: ' + str(value)) + if len(value_tuple) != n: + raise ValueError('The `' + name + '` argument must be a tuple of ' + + str(n) + ' integers. Received: ' + str(value)) + for single_value in value_tuple: + try: + int(single_value) + except ValueError: + raise ValueError('The `' + name + '` argument must be a tuple of ' + + str(n) + ' integers. Received: ' + str(value) + ' ' + 'including element ' + str(single_value) + ' of type' + + ' ' + str(type(single_value))) + return value_tuple + + +def normalize_data_format(value): + data_format = value.lower() + if data_format not in {'channels_first', 'channels_last'}: + raise ValueError('The `data_format` argument must be one of ' + '"channels_first", "channels_last". Received: ' + + str(value)) + return data_format + + +def normalize_padding(value): + padding = value.lower() + if padding not in {'valid', 'same'}: + raise ValueError('The `padding` argument must be one of "valid", "same". ' + 'Received: ' + str(padding)) + return padding diff --git a/tensorflow/python/layers/conv_utils_test.py b/tensorflow/python/layers/conv_utils_test.py new file mode 100644 index 00000000000000..02012bff43de46 --- /dev/null +++ b/tensorflow/python/layers/conv_utils_test.py @@ -0,0 +1,72 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
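Note: the conv_utils helpers above normalize user-facing arguments before the convolution layers consume them. A small illustrative sketch (not part of the patch; the argument values are made up, and the expected results match the tests that follow):

from tensorflow.python.layers import conv_utils

conv_utils.normalize_tuple(3, n=2, name='kernel_size')    # -> (3, 3)
conv_utils.normalize_tuple((2, 1), n=2, name='strides')   # -> (2, 1)
conv_utils.normalize_data_format('Channels_Last')         # -> 'channels_last'
conv_utils.normalize_padding('SAME')                      # -> 'same'
conv_utils.convert_data_format('channels_first', 4)       # -> 'NCHW'
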
+# ============================================================================== +"""Tests for tf.layers.core.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.python.layers import conv_utils + + +class ConvUtilsTest(tf.test.TestCase): + + def testConvertDataFormat(self): + self.assertEqual( + conv_utils.convert_data_format('channels_first', 4), 'NCHW') + self.assertEqual(conv_utils.convert_data_format('channels_first', 3), 'NCW') + self.assertEqual(conv_utils.convert_data_format('channels_last', 4), 'NHWC') + self.assertEqual(conv_utils.convert_data_format('channels_last', 3), 'NWC') + self.assertEqual( + conv_utils.convert_data_format('channels_last', 5), 'NDHWC') + + with self.assertRaises(ValueError): + conv_utils.convert_data_format('invalid', 2) + + def testNormalizeTuple(self): + self.assertEqual( + conv_utils.normalize_tuple(2, n=3, name='strides'), (2, 2, 2)) + self.assertEqual( + conv_utils.normalize_tuple((2, 1, 2), n=3, name='strides'), (2, 1, 2)) + + with self.assertRaises(ValueError): + conv_utils.normalize_tuple((2, 1), n=3, name='strides') + + with self.assertRaises(ValueError): + conv_utils.normalize_tuple(None, n=3, name='strides') + + def testNormalizeDataFormat(self): + self.assertEqual( + conv_utils.normalize_data_format('Channels_Last'), 'channels_last') + self.assertEqual( + conv_utils.normalize_data_format('CHANNELS_FIRST'), 'channels_first') + + with self.assertRaises(ValueError): + conv_utils.normalize_data_format('invalid') + + def testNormalizePadding(self): + self.assertEqual( + conv_utils.normalize_padding('SAME'), 'same') + self.assertEqual( + conv_utils.normalize_padding('VALID'), 'valid') + + with self.assertRaises(ValueError): + conv_utils.normalize_padding('invalid') + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py new file mode 100644 index 00000000000000..82be4d3f4af7c4 --- /dev/null +++ b/tensorflow/python/layers/convolutional.py @@ -0,0 +1,1189 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order +"""Contains the convolutional layer classes and their functional aliases. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import variable_scope as vs + +from tensorflow.python.layers import base +from tensorflow.python.layers import conv_utils as utils + + +class _Conv(base._Layer): # pylint: disable=protected-access + """Abstract nD convolution layer (private, used as implementation base). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: an integer or tuple/list of n integers, specifying the + length of the 1D convolution window. + strides: an integer or tuple/list of n integers, + specifying the stride length of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: an integer or tuple/list of n integers, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. 
+ """ + + def __init__(self, rank, + filters, + kernel_size, + strides=1, + padding='valid', + data_format='channels_last', + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(_Conv, self).__init__(trainable=trainable, + name=name, **kwargs) + self.rank = rank + self.filters = filters + self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size') + self.strides = utils.normalize_tuple(strides, rank, 'strides') + self.padding = utils.normalize_padding(padding) + self.data_format = utils.normalize_data_format(data_format) + self.dilation_rate = utils.normalize_tuple( + dilation_rate, rank, 'dilation_rate') + self.activation = activation + self.use_bias = use_bias + self.kernel_initializer = kernel_initializer + self.bias_initializer = bias_initializer + self.kernel_regularizer = kernel_regularizer + self.bias_regularizer = bias_regularizer + self.activity_regularizer = activity_regularizer + + def build(self, input_shape): + if len(input_shape) != self.rank + 2: + raise ValueError('Inputs should have rank ' + + str(self.rank + 2) + + 'Received input shape:', str(input_shape)) + if self.data_format == 'channels_first': + channel_axis = 1 + else: + channel_axis = -1 + if input_shape[channel_axis] is None: + raise ValueError('The channel dimension of the inputs ' + 'should be defined. Found `None`.') + input_dim = input_shape[channel_axis] + kernel_shape = self.kernel_size + (input_dim, self.filters) + + self.kernel = vs.get_variable('kernel', + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + trainable=True, + dtype=self._dtype) + if self.use_bias: + self.bias = vs.get_variable('bias', + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + trainable=True, + dtype=self._dtype) + else: + self.bias = None + + def call(self, inputs): + outputs = nn.convolution( + input=inputs, + filter=self.kernel, + dilation_rate=self.dilation_rate, + strides=self.strides, + padding=self.padding.upper(), + data_format=utils.convert_data_format(self.data_format, self.rank + 2)) + if self.bias is not None: + if self.rank != 2 and self.data_format == 'channels_first': + # bias_add does not support channels_first for non-4D inputs. + if self.rank == 1: + bias = array_ops.reshape(self.bias, (1, self.filters, 1)) + if self.rank == 3: + bias = array_ops.reshape(self.bias, (1, self.filters, 1, 1)) + outputs += bias + else: + outputs = nn.bias_add( + outputs, + self.bias, + data_format=utils.convert_data_format(self.data_format, 4)) + # Note that we passed rank=4 because bias_add will only accept + # NHWC and NCWH even if the rank of the inputs is 3 or 5. + + if self.activation is not None: + return self.activation(outputs) + return outputs + + +class Conv1D(_Conv): + """1D convolution layer (e.g. temporal convolution). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). 
+ kernel_size: An integer or tuple/list of a single integer, specifying the + length of the 1D convolution window. + strides: an integer or tuple/list of a single integer, + specifying the stride length of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: an integer or tuple/list of a single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + """ + + def __init__(self, filters, + kernel_size, + strides=1, + padding='valid', + data_format='channels_last', + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(Convolution1D, self).__init__( + rank=1, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, **kwargs) + + +def conv1d(inputs, + filters, + kernel_size, + strides=1, + padding='valid', + data_format='channels_last', + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=False): + """Functional interface for 1D convolution layer (e.g. temporal convolution). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + inputs: Tensor input. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: An integer or tuple/list of a single integer, specifying the + length of the 1D convolution window. 
+ strides: an integer or tuple/list of a single integer, + specifying the stride length of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: an integer or tuple/list of a single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + """ + layer = Conv1D( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + _reuse=reuse, + _scope=name) + return layer.apply(inputs) + + +class Conv2D(_Conv): + """2D convolution layer (e.g. spatial convolution over images). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: an integer or tuple/list of 2 integers, specifying the + width and height of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: an integer or tuple/list of 2 integers, + specifying the strides of the convolution along the width and height. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. 
+ dilation_rate: an integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + """ + + def __init__(self, filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + dilation_rate=(1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(Conv2D, self).__init__( + rank=2, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, **kwargs) + + +def conv2d(inputs, + filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + dilation_rate=(1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=False): + """Functional interface for the 2D convolution layer. + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + inputs: Tensor input. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: an integer or tuple/list of 2 integers, specifying the + width and height of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: an integer or tuple/list of 2 integers, + specifying the strides of the convolution along the width and height. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. 
+ `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + dilation_rate: an integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + """ + layer = Conv2D( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + _reuse=reuse, + _scope=name) + return layer.apply(inputs) + + +class Conv3D(_Conv): + """3D convolution layer (e.g. spatial convolution over volumes). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: an integer or tuple/list of 3 integers, specifying the + width and height of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: an integer or tuple/list of 3 integers, + specifying the strides of the convolution along the width and height. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + dilation_rate: an integer or tuple/list of 3 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. 
+ activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + """ + + def __init__(self, filters, + kernel_size, + strides=(1, 1, 1), + padding='valid', + data_format='channels_last', + dilation_rate=(1, 1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(Conv3D, self).__init__( + rank=3, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, **kwargs) + + +def conv3d(inputs, + filters, + kernel_size, + strides=(1, 1, 1), + padding='valid', + data_format='channels_last', + dilation_rate=(1, 1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=False): + """Functional interface for the 3D convolution layer. + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Arguments: + inputs: Tensor input. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: an integer or tuple/list of 3 integers, specifying the + width and height of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: an integer or tuple/list of 3 integers, + specifying the strides of the convolution along the width and height. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + dilation_rate: an integer or tuple/list of 3 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. 
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, no bias will
+      be applied.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+  """
+  layer = Conv3D(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      use_bias=use_bias,
+      kernel_initializer=kernel_initializer,
+      bias_initializer=bias_initializer,
+      kernel_regularizer=kernel_regularizer,
+      bias_regularizer=bias_regularizer,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      name=name,
+      _reuse=reuse,
+      _scope=name)
+  return layer.apply(inputs)
+
+
+class SeparableConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: a tuple or list of N positive integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: a tuple or list of N positive integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, width, height, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, width, height)`.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, no bias will
+      be applied.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+ bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + """ + + def __init__(self, filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=init_ops.zeros_initializer, + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(SeparableConv2D, self).__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + activation=activation, + use_bias=use_bias, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + **kwargs) + self.depth_multiplier = depth_multiplier + self.depthwise_initializer = depthwise_initializer + self.pointwise_initializer = pointwise_initializer + self.depthwise_regularizer = depthwise_regularizer + self.pointwise_regularizer = pointwise_regularizer + + def build(self, input_shape): + if len(input_shape) < 4: + raise ValueError('Inputs to `SeparableConv2D` should have rank 4. ' + 'Received input shape:', str(input_shape)) + if self.data_format == 'channels_first': + channel_axis = 1 + else: + channel_axis = 3 + if input_shape[channel_axis] is None: + raise ValueError('The channel dimension of the inputs to ' + '`SeparableConv2D` ' + 'should be defined. Found `None`.') + input_dim = int(input_shape[channel_axis]) + depthwise_kernel_shape = (self.kernel_size[0], + self.kernel_size[1], + input_dim, + self.depth_multiplier) + pointwise_kernel_shape = (1, 1, + self.depth_multiplier * input_dim, + self.filters) + + self.depthwise_kernel = vs.get_variable( + 'depthwise_kernel', + shape=depthwise_kernel_shape, + initializer=self.depthwise_initializer, + regularizer=self.depthwise_regularizer, + trainable=True, + dtype=self._dtype) + self.pointwise_kernel = vs.get_variable( + 'pointwise_kernel', + shape=pointwise_kernel_shape, + initializer=self.pointwise_initializer, + regularizer=self.pointwise_regularizer, + trainable=True, + dtype=self._dtype) + if self.use_bias: + self.bias = vs.get_variable('bias', + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + trainable=True, + dtype=self._dtype) + else: + self.bias = None + + def call(self, inputs): + if self.data_format == 'channels_first': + # Reshape to channels last + inputs = array_ops.transpose(inputs, (0, 2, 3, 1)) + + # Apply the actual ops. 
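+    # nn.separable_conv2d first convolves each input channel independently
+    # with the depthwise kernel (producing in_channels * depth_multiplier
+    # intermediate channels) and then mixes those channels with the 1x1
+    # pointwise kernel to produce `filters` output channels. The strides are
+    # padded out to NHWC form because the input was transposed above if needed.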
+ outputs = nn.separable_conv2d( + inputs, + self.depthwise_kernel, + self.pointwise_kernel, + strides=(1,) + self.strides + (1,), + padding=self.padding.upper()) + + if self.data_format == 'channels_first': + # Reshape to channels first + outputs = array_ops.transpose(outputs, (0, 3, 1, 2)) + + if self.bias: + outputs = nn.bias_add( + outputs, + self.bias, + data_format=utils.convert_data_format(self.data_format, ndim=4)) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + +def separable_conv2d(inputs, + filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=init_ops.zeros_initializer, + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=False): + """Functional interface for the depthwise separable 2D convolution layer. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. + It then optionally applies an activation function to produce the final output. + + Arguments: + inputs: Input tensor. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: a tuple or list of N positive integers specifying the spatial + dimensions of of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: a tuple or list of N positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shapedata_format = 'NWHC' + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution kernel. + pointwise_initializer: An initializer for the pointwise convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. 
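+
+  A minimal usage sketch (the input shape and argument values below are
+  illustrative assumptions, not requirements of the API):
+
+      images = tf.random_uniform((4, 32, 32, 3))
+      outputs = separable_conv2d(images, filters=16, kernel_size=3,
+                                 depth_multiplier=2, padding='same')
+      # Creates a (3, 3, 3, 2) depthwise kernel and a (1, 1, 6, 16) pointwise
+      # kernel; `outputs` has shape (4, 32, 32, 16).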
+ """ + layer = SeparableConv2D( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + depth_multiplier=depth_multiplier, + activation=activation, + use_bias=use_bias, + depthwise_initializer=depthwise_initializer, + pointwise_initializer=pointwise_initializer, + bias_initializer=bias_initializer, + depthwise_regularizer=depthwise_regularizer, + pointwise_regularizer=pointwise_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + _reuse=reuse, + _scope=name) + return layer.apply(inputs) + + +class Conv2DTranspose(Conv2D): + """Transposed convolution layer (sometimes called Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + Arguments: + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: a tuple or list of 2 positive integers specifying the spatial + dimensions of of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: a tuple or list of 2 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value for + all spatial dimensions. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. 
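+
+  The output size follows the shape inference in `call` below: with
+  `padding='same'` each spatial dimension is multiplied by its stride, while
+  `padding='valid'` additionally adds `max(kernel_size - stride, 0)`. A
+  minimal sketch (shapes and values are illustrative assumptions):
+
+      images = tf.random_uniform((4, 8, 8, 3))
+      layer = Conv2DTranspose(16, kernel_size=3, strides=2, padding='same')
+      outputs = layer.apply(images)  # shape (4, 16, 16, 16)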
+ """ + + def __init__(self, filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(Conv2DTranspose, self).__init__( + filters, + kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + **kwargs) + + def build(self, input_shape): + if len(input_shape) != 4: + raise ValueError('Inputs should have rank ' + + str(4) + + 'Received input shape:', str(input_shape)) + if self.data_format == 'channels_first': + channel_axis = 1 + else: + channel_axis = -1 + if input_shape[channel_axis] is None: + raise ValueError('The channel dimension of the inputs ' + 'should be defined. Found `None`.') + input_dim = input_shape[channel_axis] + kernel_shape = self.kernel_size + (self.filters, input_dim) + + self.kernel = vs.get_variable('kernel', + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + trainable=True, + dtype=self._dtype) + if self.use_bias: + self.bias = vs.get_variable('bias', + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + trainable=True, + dtype=self._dtype) + else: + self.bias = None + + def call(self, inputs): + inputs_shape = array_ops.shape(inputs) + batch_size = inputs_shape[0] + if self.data_format == 'channels_first': + c_axis, h_axis, w_axis = 1, 2, 3 + else: + c_axis, h_axis, w_axis = 3, 1, 2 + + height, width = inputs_shape[h_axis], inputs_shape[w_axis] + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.strides + + def get_deconv_dim(dim_size, stride_size, kernel_size, padding): + if isinstance(dim_size, ops.Tensor): + dim_size = math_ops.mul(dim_size, stride_size) + elif dim_size is not None: + dim_size *= stride_size + + if padding == 'valid' and dim_size is not None: + dim_size += max(kernel_size - stride_size, 0) + return dim_size + + # Infer the dynamic output shape: + out_height = get_deconv_dim(height, stride_h, kernel_h, self.padding) + out_width = get_deconv_dim(width, stride_w, kernel_w, self.padding) + + if self.data_format == 'channels_first': + output_shape = (batch_size, self.filters, out_height, out_width) + strides = (1, 1, stride_h, stride_w) + else: + output_shape = (batch_size, out_height, out_width, self.filters) + strides = (1, stride_h, stride_w, 1) + + output_shape_tensor = array_ops.pack(output_shape) + outputs = nn.conv2d_transpose( + inputs, + self.kernel, + output_shape_tensor, + strides, + padding=self.padding.upper(), + data_format=utils.convert_data_format(self.data_format, ndim=4)) + + # Infer the static output shape: + out_shape = inputs.get_shape().as_list() + out_shape[c_axis] = self.filters + out_shape[h_axis] = get_deconv_dim( + out_shape[h_axis], stride_h, kernel_h, self.padding) + out_shape[w_axis] = get_deconv_dim( + out_shape[w_axis], stride_w, kernel_w, self.padding) + outputs.set_shape(out_shape) + + if self.bias: + outputs = nn.bias_add( + outputs, + self.bias, + data_format=utils.convert_data_format(self.data_format, ndim=4)) + + if self.activation 
is not None: + return self.activation(outputs) + return outputs + + +def conv2d_transpose(inputs, + filters, + kernel_size, + strides=(1, 1), + padding='valid', + data_format='channels_last', + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=False): + """Transposed convolution layer (sometimes called Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + Arguments: + inputs: Input tensor. + filters: integer, the dimensionality of the output space (i.e. the number + output of filters in the convolution). + kernel_size: a tuple or list of 2 positive integers specifying the spatial + dimensions of of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: a tuple or list of 2 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value for + all spatial dimensions. + padding: one of `"valid"` or `"same"` (case-insensitive). + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, width, height, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, width, height)`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, no bias will + be applied. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. 
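+
+  A minimal sketch of variable reuse with the functional form (tensor and
+  scope names below are illustrative assumptions):
+
+      x = tf.random_uniform((4, 8, 8, 3))
+      y1 = conv2d_transpose(x, 16, [3, 3], name='deconv1')
+      # Passing reuse=True with the same name reuses the existing kernel and
+      # bias instead of creating new variables.
+      y2 = conv2d_transpose(x, 16, [3, 3], name='deconv1', reuse=True)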
+ """ + layer = Conv2DTranspose( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + _reuse=reuse, + _scope=name) + return layer.apply(inputs) + + +# Aliases + +Convolution1D = Conv1D +Convolution2D = Conv2D +Convolution3D = Conv3D +SeparableConvolution2D = SeparableConv2D +Convolution2DTranspose = Deconvolution2D = Deconv2D = Conv2DTranspose +convolution1d = conv1d +convolution2d = conv2d +convolution3d = conv3d +separable_convolution2d = separable_conv2d +convolution2d_transpose = deconvolution2d = deconv2d = conv2d_transpose diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py new file mode 100644 index 00000000000000..c74a3f9cdcb6c0 --- /dev/null +++ b/tensorflow/python/layers/convolutional_test.py @@ -0,0 +1,550 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tf.layers.core.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.python.layers import convolutional as conv_layers + + +class ConvTest(tf.test.TestCase): + + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'data_format'): + conv_layers.conv2d(images, 32, 3, data_format='invalid') + + def testInvalidStrides(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.conv2d(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.conv2d(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.conv2d(images, 32, (1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.conv2d(images, 32, None) + + def testCreateConv2D(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, [3, 3], activation=tf.nn.relu) + output = layer.apply(images) + self.assertEqual(output.op.name, 'conv2d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DIntegerKernelSize(self): + 
height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, 3) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DChannelsFirst(self): + height, width = 7, 9 + images = tf.random_uniform((5, 4, height, width)) + layer = conv_layers.Conv2D(32, [3, 3], + data_format='channels_first') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 32, height - 2, width - 2]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DPaddingSame(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 32), seed=1) + layer = conv_layers.Conv2D(64, images.get_shape()[1:3], padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) + + def testCreateConvWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random_uniform((5, height, width, 3), seed=1) + layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 2), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 32]) + + # Test strides integer + layer = conv_layers.Conv2D(32, [3, 3], strides=2, padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 32]) + + # Test unequal strides + layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 1), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width, 32]) + + def testCreateConv1D(self): + width = 7 + data = tf.random_uniform((5, width, 4)) + layer = conv_layers.Conv1D(32, 3, activation=tf.nn.relu) + output = layer.apply(data) + self.assertEqual(output.op.name, 'conv1d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv1DChannelsFirst(self): + width = 7 + data = tf.random_uniform((5, 4, width)) + layer = conv_layers.Conv1D(32, 3, data_format='channels_first') + output = layer.apply(data) + self.assertListEqual(output.get_shape().as_list(), + [5, 32, width - 2]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv3D(self): + depth, height, width = 6, 7, 9 + volumes = tf.random_uniform((5, depth, height, width, 4)) + layer = conv_layers.Conv3D(32, [3, 3, 3], activation=tf.nn.relu) + output = layer.apply(volumes) + self.assertEqual(output.op.name, 'conv3d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, depth - 2, height - 2, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DKernelRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2D(32, [3, 3], kernel_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + 
self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testConv2DBiasRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2D(32, [3, 3], bias_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testConv2DNoBias(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, [3, 3], + activation=tf.nn.relu, use_bias=False) + output = layer.apply(images) + self.assertEqual(output.op.name, 'conv2d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertEqual(layer.bias, None) + + def testDilatedConv2D(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=3) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 1, 3, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + # Test tuple dilation rate + layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=(1, 3)) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, 3, 32]) + + def testFunctionalConv2DReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3], name='conv1') + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + conv_layers.conv2d(images, 32, [3, 3], name='conv1', reuse=True) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + + def testFunctionalConv2DNoReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + conv_layers.conv2d(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 4) + + +class SeparableConv2DTest(tf.test.TestCase): + + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'data_format'): + conv_layers.separable_conv2d(images, 32, 3, data_format='invalid') + + def testInvalidStrides(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.separable_conv2d(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.separable_conv2d(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.separable_conv2d(images, 32, (1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.separable_conv2d(images, 32, None) + + def testCreateSeparableConv2D(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, [3, 3], activation=tf.nn.relu) + output = 
layer.apply(images) + self.assertEqual(output.op.name, 'separable_conv2d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1]) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DDepthMultiplier(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, [3, 3], depth_multiplier=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 2]) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 8, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DIntegerKernelSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, 3) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1]) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DChannelsFirst(self): + height, width = 7, 9 + images = tf.random_uniform((5, 4, height, width)) + layer = conv_layers.SeparableConv2D(32, [3, 3], + data_format='channels_first') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 32, height - 2, width - 2]) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1]) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testSeparableConv2DPaddingSame(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 32), seed=1) + layer = conv_layers.SeparableConv2D( + 64, images.get_shape()[1:3], padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) + + def testCreateSeparableConvWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random_uniform((5, height, width, 3), seed=1) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=(2, 2), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 32]) + + # Test strides integer + layer = conv_layers.SeparableConv2D(32, [3, 3], strides=2, padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 32]) + + # Test unequal strides + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=(2, 1), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width, 32]) + + def testFunctionalConv2DReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1') + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 3) + conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1', + reuse=True) + 
self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 3) + + def testFunctionalConv2DNoReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 3) + conv_layers.separable_conv2d(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 6) + + def testSeparableConv2DDepthwiseRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D(32, [3, 3], depthwise_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testSeparableConv2DPointwiseRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D(32, [3, 3], pointwise_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testSeparableConv2DBiasRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D(32, [3, 3], bias_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testSeparableConv2DNoBias(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, [3, 3], + activation=tf.nn.relu, use_bias=False) + output = layer.apply(images) + self.assertEqual(output.op.name, 'separable_conv2d/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height - 2, width - 2, 32]) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1]) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32]) + self.assertEqual(layer.bias, None) + + +class Conv2DTransposeTest(tf.test.TestCase): + + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'data_format'): + conv_layers.conv2d_transpose(images, 32, 3, data_format='invalid') + + def testInvalidStrides(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.conv2d_transpose(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'strides'): + conv_layers.conv2d_transpose(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.conv2d_transpose(images, 32, (1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'kernel_size'): + conv_layers.conv2d_transpose(images, 32, None) + + def testCreateConv2DTranspose(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose(32, [3, 3], activation=tf.nn.relu) + output = 
layer.apply(images) + self.assertEqual(output.op.name, 'conv2d_transpose/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height + 2, width + 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DTransposeIntegerKernelSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose(32, 3) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height + 2, width + 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DTransposeChannelsFirst(self): + height, width = 7, 9 + images = tf.random_uniform((5, 4, height, width)) + layer = conv_layers.Conv2DTranspose(32, [3, 3], + data_format='channels_first') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 32, height + 2, width + 2]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DTransposePaddingSame(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 32), seed=1) + layer = conv_layers.Conv2DTranspose( + 64, images.get_shape()[1:3], padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) + + def testCreateConv2DTransposeWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random_uniform((5, height, width, 3), seed=1) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], strides=(2, 2), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height * 2, width * 2, 32]) + + # Test strides integer + layer = conv_layers.Conv2DTranspose(32, [3, 3], strides=2, padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height * 2, width * 2, 32]) + + # Test unequal strides + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], strides=(2, 1), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height * 2, width, 32]) + + def testConv2DTransposeKernelRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2DTranspose(32, [3, 3], kernel_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testConv2DTransposeBiasRegularizer(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2DTranspose(32, [3, 3], bias_regularizer=reg) + layer.apply(images) + loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(layer.losses, loss_keys) + + def testConv2DTransposeNoBias(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose(32, [3, 3], + activation=tf.nn.relu, use_bias=False) + output = layer.apply(images) + self.assertEqual(output.op.name, 'conv2d_transpose/Relu') + self.assertListEqual(output.get_shape().as_list(), + [5, height + 2, width + 2, 32]) + 
self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertEqual(layer.bias, None) + + def testFunctionalConv2DTransposeReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1') + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1', + reuse=True) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + + def testFunctionalConv2DTransposeNoReuse(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) + conv_layers.conv2d_transpose(images, 32, [3, 3]) + self.assertEqual( + len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 4) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index f3ffbf33b91d7e..da8df67cb8f662 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -14,7 +14,7 @@ # ============================================================================= # pylint: disable=unused-import,g-bad-import-order -"""Contains the core layers: FullyConnected, [Flatten, Dropout]. +"""Contains the core layers: Dense, Dropout. Also contains their functional aliases. """ @@ -39,11 +39,8 @@ from tensorflow.python.layers import base -class FullyConnected(base._Layer): # pylint: disable=protected-access - """Fully-connected layer class. - - WARNING: Do not use this class unless you know what you are doing: - the API is subject to future changes. +class Dense(base._Layer): # pylint: disable=protected-access + """Densely-connected layer class. This layer implements the operation `outputs = activation(inputs.w + b)` Where `activation` is the activation function passed as the `activation` @@ -94,8 +91,7 @@ def __init__(self, units, trainable=True, name=None, **kwargs): - super(FullyConnected, self).__init__(trainable=trainable, name=name, - **kwargs) + super(Dense, self).__init__(trainable=trainable, name=name, **kwargs) self.units = units self.activation = activation self.use_bias = use_bias @@ -108,11 +104,11 @@ def __init__(self, units, def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) if input_shape.ndims is None: - raise ValueError('Inputs to `FullyConnected` should have known rank.') + raise ValueError('Inputs to `Dense` should have known rank.') if len(input_shape) < 2: - raise ValueError('Inputs to `FullyConnected` should have rank >= 2.') + raise ValueError('Inputs to `Dense` should have rank >= 2.') if input_shape[-1].value is None: - raise ValueError('The last dimension of the inputs to `FullyConnected` ' + raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. Found `None`.') # Note that we set `trainable=True` because this is a trainable # weight of the layer. 
If the layer is not trainable @@ -122,10 +118,10 @@ def build(self, input_shape): shape=[input_shape[-1].value, self.units], initializer=self.weights_initializer, regularizer=self.weights_regularizer, - dtype=self._dtype, + dtype=self.dtype, trainable=True) if self.use_bias: - self.bias = vs.get_variable('biases', + self.bias = vs.get_variable('bias', shape=[self.units,], initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -159,7 +155,7 @@ def call(self, inputs): return outputs -def fully_connected( +def dense( inputs, units, activation=None, use_bias=True, @@ -171,7 +167,7 @@ def fully_connected( trainable=True, name=None, reuse=False): - """Functional interface for the fully connected layer. + """Functional interface for the densely-connected layer. This layer implements the operation `outputs = activation(inputs.w + b)` Where `activation` is the activation function passed as the `activation` @@ -201,19 +197,19 @@ def fully_connected( Returns: Output tensor. """ - layer = FullyConnected(units, - activation=activation, - use_bias=use_bias, - weights_initializer=weights_initializer, - bias_initializer=bias_initializer, - weights_regularizer=weights_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - _scope=name, - _reuse_weights=reuse) + layer = Dense(units, + activation=activation, + use_bias=use_bias, + weights_initializer=weights_initializer, + bias_initializer=bias_initializer, + weights_regularizer=weights_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + dtype=inputs.dtype.base_dtype, + _scope=name, + _reuse=reuse) return layer.apply(inputs) @@ -303,3 +299,9 @@ def dropout(inputs, """ layer = Dropout(rate, noise_shape=noise_shape, seed=seed, name=name) return layer.apply(inputs, training=training) + + +# Aliases + +FullyConnected = Dense +fully_connected = dense diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py index 710fd37fd0d98c..d3b0ee15502fc6 100644 --- a/tensorflow/python/layers/core_test.py +++ b/tensorflow/python/layers/core_test.py @@ -24,193 +24,195 @@ from tensorflow.python.layers import core as core_layers -class FullyConnectedTest(tf.test.TestCase): - - def testFCProperties(self): - fc = core_layers.FullyConnected(2, activation=tf.nn.relu, name='fc') - self.assertEqual(fc.units, 2) - self.assertEqual(fc.activation, tf.nn.relu) - self.assertEqual(fc.weights_regularizer, None) - self.assertEqual(fc.bias_regularizer, None) - self.assertEqual(fc.activity_regularizer, None) - self.assertEqual(fc.use_bias, True) - self.assertEqual(fc.name, 'fc') +class DenseTest(tf.test.TestCase): + + def testDenseProperties(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name='my_dense') + self.assertEqual(dense.units, 2) + self.assertEqual(dense.activation, tf.nn.relu) + self.assertEqual(dense.weights_regularizer, None) + self.assertEqual(dense.bias_regularizer, None) + self.assertEqual(dense.activity_regularizer, None) + self.assertEqual(dense.use_bias, True) + self.assertEqual(dense.name, 'my_dense') # Test auto-naming - fc = core_layers.FullyConnected(2, activation=tf.nn.relu) - self.assertEqual(fc.name, 'fully_connected') - fc = core_layers.FullyConnected(2, activation=tf.nn.relu) - self.assertEqual(fc.name, 'fully_connected_1') + dense = core_layers.Dense(2, activation=tf.nn.relu) + self.assertEqual(dense.name, 'dense') 
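For reference, the `Dense` docstring above describes the forward pass as `outputs = activation(inputs.w + b)`. A minimal illustrative sketch of that computation for a rank-2 input (the shapes and values here are assumptions, not part of the patch; `import tensorflow as tf` as in the tests):

x = tf.random_uniform((5, 3))          # batch of 5 examples, 3 features
w = tf.ones((3, 2))                    # kernel: [input_dim, units]
b = tf.zeros((2,))                     # bias: [units]
y = tf.nn.relu(tf.matmul(x, w) + b)    # matches Dense(2, activation=tf.nn.relu), per the docstring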
+ dense = core_layers.Dense(2, activation=tf.nn.relu) + self.assertEqual(dense.name, 'dense_1') def testCall(self): - fc = core_layers.FullyConnected(2, activation=tf.nn.relu, name='fc') + dense = core_layers.Dense(2, activation=tf.nn.relu, name='my_dense') inputs = tf.random_uniform((5, 2), seed=1) - _ = fc(inputs) - self.assertListEqual(fc.weights, [fc.w, fc.bias]) - self.assertListEqual(fc.trainable_weights, [fc.w, fc.bias]) - self.assertListEqual(fc.non_trainable_weights, []) - self.assertListEqual(fc._trainable_weights, [fc.w, fc.bias]) - self.assertListEqual(fc._non_trainable_weights, []) + _ = dense(inputs) + self.assertListEqual(dense.variables, [dense.w, dense.bias]) + self.assertListEqual(dense.trainable_variables, [dense.w, dense.bias]) + self.assertListEqual(dense.non_trainable_variables, []) + self.assertListEqual(dense._trainable_variables, [dense.w, dense.bias]) + self.assertListEqual(dense._non_trainable_variables, []) self.assertEqual( len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) - self.assertEqual(fc.w.name, 'fc/weights:0') - self.assertEqual(fc.bias.name, 'fc/biases:0') + self.assertEqual(dense.w.name, 'my_dense/weights:0') + self.assertEqual(dense.bias.name, 'my_dense/bias:0') def testNoBias(self): - fc = core_layers.FullyConnected(2, use_bias=False, name='fc') + dense = core_layers.Dense(2, use_bias=False, name='my_dense') inputs = tf.random_uniform((5, 2), seed=1) - _ = fc(inputs) - self.assertListEqual(fc.weights, [fc.w]) - self.assertListEqual(fc.trainable_weights, [fc.w]) - self.assertListEqual(fc.non_trainable_weights, []) + _ = dense(inputs) + self.assertListEqual(dense.variables, [dense.w]) + self.assertListEqual(dense.trainable_variables, [dense.w]) + self.assertListEqual(dense.non_trainable_variables, []) self.assertEqual( len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 1) - self.assertEqual(fc.w.name, 'fc/weights:0') - self.assertEqual(fc.bias, None) + self.assertEqual(dense.w.name, 'my_dense/weights:0') + self.assertEqual(dense.bias, None) def testNonTrainable(self): - fc = core_layers.FullyConnected(2, trainable=False, name='fc') + dense = core_layers.Dense(2, trainable=False, name='my_dense') inputs = tf.random_uniform((5, 2), seed=1) - _ = fc(inputs) - self.assertListEqual(fc.weights, [fc.w, fc.bias]) - self.assertListEqual(fc.non_trainable_weights, [fc.w, fc.bias]) - self.assertListEqual(fc.trainable_weights, []) - self.assertListEqual(fc._trainable_weights, [fc.w, fc.bias]) - self.assertListEqual(fc._non_trainable_weights, []) + _ = dense(inputs) + self.assertListEqual(dense.variables, [dense.w, dense.bias]) + self.assertListEqual(dense.non_trainable_variables, + [dense.w, dense.bias]) + self.assertListEqual(dense.trainable_variables, []) + self.assertListEqual(dense._trainable_variables, + [dense.w, dense.bias]) + self.assertListEqual(dense._non_trainable_variables, []) self.assertEqual( len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 0) def testOutputShape(self): - fc = core_layers.FullyConnected(7, activation=tf.nn.relu, name='fc') + dense = core_layers.Dense(7, activation=tf.nn.relu, name='my_dense') inputs = tf.random_uniform((5, 3), seed=1) - outputs = fc.apply(inputs) + outputs = dense.apply(inputs) self.assertEqual(outputs.get_shape().as_list(), [5, 7]) inputs = tf.random_uniform((5, 2, 3), seed=1) - outputs = fc(inputs) + outputs = dense(inputs) self.assertEqual(outputs.get_shape().as_list(), [5, 2, 7]) inputs = tf.random_uniform((1, 2, 4, 3), seed=1) - outputs = fc.apply(inputs) + outputs = 
dense.apply(inputs) self.assertEqual(outputs.get_shape().as_list(), [1, 2, 4, 7]) def testCallOnPlaceHolder(self): inputs = tf.placeholder(dtype=tf.float32) - fc = core_layers.FullyConnected(4, name='fc') + dense = core_layers.Dense(4, name='my_dense') with self.assertRaises(ValueError): - fc(inputs) + dense(inputs) inputs = tf.placeholder(dtype=tf.float32, shape=[None, None]) - fc = core_layers.FullyConnected(4, name='fc') + dense = core_layers.Dense(4, name='my_dense') with self.assertRaises(ValueError): - fc(inputs) + dense(inputs) inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, None]) - fc = core_layers.FullyConnected(4, name='fc') + dense = core_layers.Dense(4, name='my_dense') with self.assertRaises(ValueError): - fc(inputs) + dense(inputs) inputs = tf.placeholder(dtype=tf.float32, shape=[None, 3]) - fc = core_layers.FullyConnected(4, name='fc') - fc(inputs) + dense = core_layers.Dense(4, name='my_dense') + dense(inputs) inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, 3]) - fc = core_layers.FullyConnected(4, name='fc') - fc(inputs) + dense = core_layers.Dense(4, name='my_dense') + dense(inputs) def testActivation(self): - fc = core_layers.FullyConnected(2, activation=tf.nn.relu, name='fc1') + dense = core_layers.Dense(2, activation=tf.nn.relu, name='dense1') inputs = tf.random_uniform((5, 3), seed=1) - outputs = fc(inputs) - self.assertEqual(outputs.op.name, 'fc1/Relu') + outputs = dense(inputs) + self.assertEqual(outputs.op.name, 'dense1/Relu') - fc = core_layers.FullyConnected(2, name='fc2') + dense = core_layers.Dense(2, name='dense2') inputs = tf.random_uniform((5, 3), seed=1) - outputs = fc(inputs) - self.assertEqual(outputs.op.name, 'fc2/BiasAdd') + outputs = dense(inputs) + self.assertEqual(outputs.op.name, 'dense2/BiasAdd') def testActivityRegularizer(self): regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - fc = core_layers.FullyConnected(2, name='fc', - activity_regularizer=regularizer) + dense = core_layers.Dense(2, name='my_dense', + activity_regularizer=regularizer) inputs = tf.random_uniform((5, 3), seed=1) - _ = fc(inputs) + _ = dense(inputs) loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) self.assertEqual(len(loss_keys), 1) - self.assertListEqual(fc.losses, loss_keys) + self.assertListEqual(dense.losses, loss_keys) def testWeightsRegularizer(self): regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - fc = core_layers.FullyConnected(2, name='fc', - weights_regularizer=regularizer) + dense = core_layers.Dense(2, name='my_dense', + weights_regularizer=regularizer) inputs = tf.random_uniform((5, 3), seed=1) - _ = fc(inputs) + _ = dense(inputs) loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) self.assertEqual(len(loss_keys), 1) - self.assertListEqual(fc.losses, loss_keys) + self.assertListEqual(dense.losses, loss_keys) def testBiasRegularizer(self): regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - fc = core_layers.FullyConnected(2, name='fc', - bias_regularizer=regularizer) + dense = core_layers.Dense(2, name='my_dense', + bias_regularizer=regularizer) inputs = tf.random_uniform((5, 3), seed=1) - _ = fc(inputs) + _ = dense(inputs) loss_keys = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) self.assertEqual(len(loss_keys), 1) - self.assertListEqual(fc.losses, loss_keys) + self.assertListEqual(dense.losses, loss_keys) - def testFunctionalFC(self): + def testFunctionalDense(self): inputs = tf.random_uniform((5, 3), seed=1) - outputs = core_layers.fully_connected( - inputs, 2, activation=tf.nn.relu, name='fc') + 
outputs = core_layers.dense( + inputs, 2, activation=tf.nn.relu, name='my_dense') self.assertEqual( len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)), 2) - self.assertEqual(outputs.op.name, 'fc/Relu') + self.assertEqual(outputs.op.name, 'my_dense/Relu') self.assertEqual(outputs.get_shape().as_list(), [5, 2]) - def testFunctionalFCTwice(self): + def testFunctionalDenseTwice(self): inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2) + core_layers.dense(inputs, 2) vars1 = tf.trainable_variables() - core_layers.fully_connected(inputs, 2) + core_layers.dense(inputs, 2) vars2 = tf.trainable_variables() self.assertEqual(len(vars1), 2) self.assertEqual(len(vars2), 4) - def testFunctionalFCTwiceReuse(self): + def testFunctionalDenseTwiceReuse(self): inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2, name='fc') + core_layers.dense(inputs, 2, name='my_dense') vars1 = tf.trainable_variables() - core_layers.fully_connected(inputs, 2, name='fc', reuse=True) + core_layers.dense(inputs, 2, name='my_dense', reuse=True) vars2 = tf.trainable_variables() self.assertEqual(vars1, vars2) - def testFunctionalFCWithCustomGetter(self): + def testFunctionalDenseWithCustomGetter(self): called = [0] def custom_getter(getter, *args, **kwargs): called[0] += 1 return getter(*args, **kwargs) with tf.variable_scope('test', custom_getter=custom_getter): inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2) + core_layers.dense(inputs, 2) self.assertEqual(called[0], 2) - def testFunctionalFCInScope(self): + def testFunctionalDenseInScope(self): with tf.variable_scope('test'): inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2, name='fc') + core_layers.dense(inputs, 2, name='my_dense') var = tf.trainable_variables()[0] - self.assertEqual(var.name, 'test/fc/weights:0') + self.assertEqual(var.name, 'test/my_dense/weights:0') with tf.variable_scope('test1') as scope: inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2, name=scope) + core_layers.dense(inputs, 2, name=scope) var = tf.trainable_variables()[2] self.assertEqual(var.name, 'test1/weights:0') with tf.variable_scope('test2'): inputs = tf.random_uniform((5, 3), seed=1) - core_layers.fully_connected(inputs, 2) + core_layers.dense(inputs, 2) var = tf.trainable_variables()[4] - self.assertEqual(var.name, 'test2/fully_connected/weights:0') + self.assertEqual(var.name, 'test2/dense/weights:0') class DropoutTest(tf.test.TestCase): diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py index 14664871644099..c25361d562a66b 100644 --- a/tensorflow/python/layers/layers.py +++ b/tensorflow/python/layers/layers.py @@ -18,7 +18,29 @@ ## Core layers -@@fully_connected +@@dense +@@dropout + +## Convolutional layers + +@@conv1d +@@conv2d +@@conv3d +@@separable_conv2d +@@conv2d_transpose + +## Pooling layers + +@@average_pooling1d +@@max_pooling1d +@@average_pooling2d +@@max_pooling2d +@@average_pooling3d +@@max_pooling3d + +## Normalization layers + +@@batch_normalization """ @@ -31,7 +53,27 @@ # pylint: disable=g-bad-import-order,unused-import # Core layers. -from tensorflow.python.layers.core import fully_connected +from tensorflow.python.layers.core import dense +from tensorflow.python.layers.core import dropout + +# Convolutional layers. 
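A minimal sketch of how the functional endpoints listed above might be chained (illustrative only: it assumes these names are in scope as imported in this module, `import tensorflow as tf`, and that `conv2d` mirrors the `conv2d_transpose` call signature used in the tests; shapes and hyperparameters are made up):

images = tf.random_uniform((8, 28, 28, 3))        # dummy image batch
is_training = tf.placeholder(dtype='bool')        # training/inference switch
net = conv2d(images, 32, [3, 3], padding='same', activation=tf.nn.relu)
net = batch_normalization(net, training=is_training)
net = max_pooling2d(net, pool_size=2, strides=2)  # -> (8, 14, 14, 32)
net = dense(tf.reshape(net, [8, -1]), 10)         # flatten, then 10 logits
net = dropout(net, rate=0.5, training=is_training)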
+from tensorflow.python.layers.convolutional import conv1d +from tensorflow.python.layers.convolutional import conv2d +from tensorflow.python.layers.convolutional import conv3d +from tensorflow.python.layers.convolutional import separable_conv2d +from tensorflow.python.layers.convolutional import conv2d_transpose + +# Pooling layers. +from tensorflow.python.layers.pooling import average_pooling1d +from tensorflow.python.layers.pooling import max_pooling1d +from tensorflow.python.layers.pooling import average_pooling2d +from tensorflow.python.layers.pooling import max_pooling2d +from tensorflow.python.layers.pooling import average_pooling3d +from tensorflow.python.layers.pooling import max_pooling3d + +# Normalization layers. +from tensorflow.python.layers.normalization import batch_normalization + # pylint: enable=g-bad-import-order,unused-import _allowed_symbols = [] diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py new file mode 100644 index 00000000000000..c9c6ce98a19490 --- /dev/null +++ b/tensorflow/python/layers/normalization.py @@ -0,0 +1,333 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order +"""Contains the normalization layer classes and their functional aliases. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.training import moving_averages +from tensorflow.python.framework import tensor_util + +from tensorflow.python.layers import base + + +class BatchNormalization(base._Layer): # pylint: disable=protected-access + """Batch Normalization layer from http://arxiv.org/abs/1502.03167. + + "Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift" + + Sergey Ioffe, Christian Szegedy + + Arguments: + axis: Integer, the axis that should be normalized (typically the features + axis). For instance, after a `Convolution2D` layer with + `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. + momentum: Momentum for the moving average. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, subtract `beta`. If False, `beta` is ignored. + scale: If True, multiply by `gamma`. If False, `gamma` is + not used. 
When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling can be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + moving_mean_initializer: Initializer for the moving mean. + moving_variance_initializer: Initializer for the moving variance. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: A string, the name of the layer. + """ + + def __init__(self, + axis=-1, + momentum=0.99, + epsilon=1e-3, + center=True, + scale=True, + beta_initializer=init_ops.zeros_initializer, + gamma_initializer=init_ops.ones_initializer(), + moving_mean_initializer=init_ops.zeros_initializer, + moving_variance_initializer=init_ops.ones_initializer(), + beta_regularizer=None, + gamma_regularizer=None, + trainable=True, + name=None, + **kwargs): + super(BatchNormalization, self).__init__( + name=name, trainable=trainable, **kwargs) + self.axis = axis + self.momentum = momentum + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = beta_initializer + self.gamma_initializer = gamma_initializer + self.moving_mean_initializer = moving_mean_initializer + self.moving_variance_initializer = moving_variance_initializer + self.beta_regularizer = beta_regularizer + self.gamma_regularizer = gamma_regularizer + + def build(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape) + if not input_shape.ndims: + raise ValueError('Input has undefined rank:', input_shape) + ndim = len(input_shape) + if self.axis < 0: + axis = ndim + self.axis + else: + axis = self.axis + if axis < 0 or axis >= ndim: + raise ValueError('Value of `axis` argument ' + str(self.axis) + + ' is out of range for input with rank ' + str(ndim)) + param_dim = input_shape[axis] + if not param_dim.value: + raise ValueError('Input has undefined `axis` dimension. Input shape: ', + input_shape) + + if self.center: + self.beta = vs.get_variable('beta', + shape=(param_dim,), + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + trainable=True) + else: + self.beta = None + if self.scale: + self.gamma = vs.get_variable('gamma', + shape=(param_dim,), + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + trainable=True) + else: + self.gamma = None + + # Disable variable partitioning when creating the moving mean and variance + partitioner = vs.get_variable_scope().partitioner + try: + vs.get_variable_scope().set_partitioner(None) + self.moving_mean = vs.get_variable( + 'moving_mean', + shape=(param_dim,), + initializer=self.moving_mean_initializer, + trainable=False) + self.moving_variance = vs.get_variable( + 'moving_variance', + shape=(param_dim,), + initializer=self.moving_variance_initializer, + trainable=False) + finally: + vs.get_variable_scope().set_partitioner(partitioner) + + def call(self, inputs, training=False): + # First, compute the axes along which to reduce the mean / variance, + # as well as the broadcast shape to be used for all parameters. 
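# (Illustrative aside, not part of the patch: with `axis=1` and an input of
# shape (5, 4, 3), `reduction_axes` works out to [0, 2] and `broadcast_shape`
# to [1, 4, 1], which matches the (1, 4, 1) reshape of gamma/beta used in
# normalization_test.py below.)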
+ input_shape = inputs.get_shape()
+ ndim = len(input_shape)
+ reduction_axes = list(range(len(input_shape)))
+ del reduction_axes[self.axis]
+ broadcast_shape = [1] * len(input_shape)
+ broadcast_shape[self.axis] = input_shape[self.axis].value
+
+ # Determines whether broadcasting is needed.
+ needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1])
+
+ # Determine the boolean training value. May be False, True, None.
+ # If None, it is assumed that `training` is a variable to be used in `cond`.
+ if isinstance(training, bool):
+ training_bool = training
+ else:
+ try:
+ training_bool = tensor_util.constant_value(training)
+ except TypeError:
+ training_bool = None
+
+ # Obtain the current batch mean and variance, if necessary.
+ if training_bool is not False:
+ # Use a copy of moving_mean as a shift to compute more reliable moments.
+ shift = math_ops.add(self.moving_mean, 0)
+ if needs_broadcasting:
+ shift = array_ops.reshape(shift, broadcast_shape)
+ broadcast_mean, broadcast_variance = nn.moments(
+ inputs, reduction_axes, shift=shift, keep_dims=True)
+ mean = array_ops.reshape(broadcast_mean, [-1])
+ variance = array_ops.reshape(broadcast_variance, [-1])
+ else:
+ mean, variance = nn.moments(inputs, reduction_axes, shift=shift)
+
+ # Prepare updates if necessary.
+ if training_bool is not False and not self.updates:
+ mean_update = moving_averages.assign_moving_average(
+ self.moving_mean, mean, self.momentum, zero_debias=False)
+ variance_update = moving_averages.assign_moving_average(
+ self.moving_variance, variance, self.momentum, zero_debias=False)
+ # In the future this should be refactored into a self.add_update
+ # method in order to allow for instance-based BN layer sharing
+ # across unrelated input streams (e.g. like in Keras).
+ self.updates.append(mean_update)
+ self.updates.append(variance_update)
+
+ # Normalize batch.
+ if needs_broadcasting:
+ # In this case we must explicitly broadcast all parameters.
+ broadcast_moving_mean = array_ops.reshape(self.moving_mean,
+ broadcast_shape)
+ broadcast_moving_variance = array_ops.reshape(self.moving_variance,
+ broadcast_shape)
+ if self.center:
+ broadcast_beta = array_ops.reshape(self.beta, broadcast_shape)
+ else:
+ broadcast_beta = None
+ if self.scale:
+ broadcast_gamma = array_ops.reshape(self.gamma, broadcast_shape)
+ else:
+ broadcast_gamma = None
+
+ if training_bool is not False:
+ normed_inputs_training = nn.batch_normalization(inputs,
+ broadcast_mean,
+ broadcast_variance,
+ broadcast_beta,
+ broadcast_gamma,
+ self.epsilon)
+ normed_inputs = nn.batch_normalization(inputs,
+ broadcast_moving_mean,
+ broadcast_moving_variance,
+ broadcast_beta,
+ broadcast_gamma,
+ self.epsilon)
+ else:
+ # No need for broadcasting.
+ if training_bool is not False:
+ normed_inputs_training = nn.batch_normalization(
+ inputs,
+ mean,
+ variance,
+ self.beta if self.center else None,
+ self.gamma if self.scale else None,
+ self.epsilon)
+ normed_inputs = nn.batch_normalization(inputs,
+ self.moving_mean,
+ self.moving_variance,
+ self.beta if self.center else None,
+ self.gamma if self.scale else None,
+ self.epsilon)
+
+ # Return the proper output depending on the boolean training phase.
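# (Illustrative aside, not part of the patch: `training` may be a Python bool,
# resolved statically above, or a bool tensor such as tf.placeholder(dtype='bool'),
# in which case tf.cond below selects between the batch-statistics and
# moving-statistics outputs at run time. normalization_test.py exercises the
# tensor case roughly as:
#
#   training = tf.placeholder(dtype='bool')
#   outputs = bn.apply(inputs, training=training)
#   sess.run([outputs] + bn.updates, feed_dict={training: True})   # training step
#   sess.run(outputs, feed_dict={training: False})                 # inference
# )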
+ if training_bool is True: + return normed_inputs_training + if training_bool is False: + return normed_inputs + return control_flow_ops.cond(training, + lambda: normed_inputs_training, + lambda: normed_inputs) + + +def batch_normalization(inputs, + axis=-1, + momentum=0.99, + epsilon=1e-3, + center=True, + scale=True, + beta_initializer=init_ops.zeros_initializer, + gamma_initializer=init_ops.ones_initializer(), + moving_mean_initializer=init_ops.zeros_initializer, + moving_variance_initializer=init_ops.ones_initializer(), + beta_regularizer=None, + gamma_regularizer=None, + training=False, + trainable=True, + name=None, + reuse=False): + """Functional interface for the batch normalization layer. + + Reference: http://arxiv.org/abs/1502.03167 + + "Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift" + + Sergey Ioffe, Christian Szegedy + + Arguments: + inputs: Tensor input. + axis: Integer, the axis that should be normalized (typically the features + axis). For instance, after a `Convolution2D` layer with + `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. + momentum: Momentum for the moving average. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, subtract `beta`. If False, `beta` is ignored. + scale: If True, multiply by `gamma`. If False, `gamma` is + not used. When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling can be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + moving_mean_initializer: Initializer for the moving mean. + moving_variance_initializer: Initializer for the moving variance. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + training: Either a Python boolean, or a TensorFlow boolean scalar tensor + (e.g. a placeholder). Whether to return the output in training mode + (normalized with statistics of the current batch) or in inference mode + (normalized with moving statistics). + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: String, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + """ + layer = BatchNormalization( + axis=axis, + momentum=momentum, + epsilon=epsilon, + center=center, + scale=scale, + beta_initializer=beta_initializer, + gamma_initializer=gamma_initializer, + moving_mean_initializer=moving_mean_initializer, + moving_variance_initializer=moving_variance_initializer, + beta_regularizer=beta_regularizer, + gamma_regularizer=gamma_regularizer, + trainable=trainable, + name=name, + _reuse=reuse, + _scope=name) + return layer.apply(inputs, training=training) + + +# Aliases + +BatchNorm = BatchNormalization +batch_norm = batch_normalization diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py new file mode 100644 index 00000000000000..e179b092dcf06b --- /dev/null +++ b/tensorflow/python/layers/normalization_test.py @@ -0,0 +1,479 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tf.layers.core.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from tensorflow.python.layers import normalization as normalization_layers + + +class BNTest(tf.test.TestCase): + + def testCreateBN(self): + # Call layer. + bn = normalization_layers.BatchNormalization(axis=1) + inputs = tf.random_uniform((5, 4, 3), seed=1) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 4) + self.assertEqual(len(bn.trainable_variables), 2) + self.assertEqual(len(bn.non_trainable_variables), 2) + + # Test that updates were created and added to UPDATE_OPS. + self.assertEqual(len(bn.updates), 2) + self.assertListEqual( + tf.get_collection(tf.GraphKeys.UPDATE_OPS), bn.updates) + + # Test that weights were created and added to TRAINABLE_VARIABLES. + self.assertListEqual( + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), + bn.trainable_variables) + + def test3DInputAxis1(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=1, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 2)) + std = np.std(np_inputs, axis=(0, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 4, 1)) + np_beta = np.reshape(np_beta, (1, 4, 1)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def test3DInputAxis2(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=2, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 1)) + std = np.std(np_inputs, axis=(0, 1)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 3)) + np_beta = np.reshape(np_beta, (1, 1, 3)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def test4DInputAxis1(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=1, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 2, 3)) + std = np.std(np_inputs, axis=(0, 2, 3)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 4, 1, 1)) + np_beta = np.reshape(np_beta, (1, 4, 1, 1)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def test4DInputAxis2(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=2, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 3)) + std = np.std(np_inputs, axis=(0, 1, 3)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 3, 1)) + np_beta = np.reshape(np_beta, (1, 1, 3, 1)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def test4DInputAxis3(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=3, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def testNegativeAxis(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=-1, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + bn.updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def testBooleanLearningPhase(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization(axis=-1, + epsilon=epsilon, momentum=0.9) + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + outputs_training = bn.apply(inputs, training=True) + outputs_infer = bn.apply(inputs, training=False) + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs_training] + bn.updates) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance]) + np_inputs = sess.run(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs_infer) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def testFunctionalNoReuse(self): + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + epsilon = 1e-3 + training = tf.placeholder(dtype='bool') + outputs = normalization_layers.batch_norm( + inputs, axis=-1, momentum=0.9, epsilon=epsilon, + training=training, name='bn') + + updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + all_vars = dict([(v.name, v) for v in tf.global_variables()]) + moving_mean = all_vars['bn/moving_mean:0'] + moving_variance = all_vars['bn/moving_variance:0'] + beta = all_vars['bn/beta:0'] + gamma = all_vars['bn/gamma:0'] + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs] + updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. + np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance]) + np_inputs = sess.run(inputs) + np_mean = np.mean(np_inputs, axis=(0, 1, 2)) + np_std = np.std(np_inputs, axis=(0, 1, 2)) + np_variance = np.square(np_std) + self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) + self.assertAllClose(np_variance, np_moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([gamma, beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def testFunctionalReuse(self): + inputs1 = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + inputs2 = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + epsilon = 1e-3 + training = tf.placeholder(dtype='bool') + _ = normalization_layers.batch_norm( + inputs1, axis=-1, momentum=0.9, epsilon=epsilon, + training=training, name='bn') + outputs2 = normalization_layers.batch_norm( + inputs2, axis=-1, momentum=0.9, epsilon=epsilon, + training=training, name='bn', reuse=True) + + # Last 2 update ops + updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)[-2:] + all_vars = dict([(v.name, v) for v in tf.global_variables()]) + moving_mean = all_vars['bn/moving_mean:0'] + moving_variance = all_vars['bn/moving_variance:0'] + beta = all_vars['bn/beta:0'] + gamma = all_vars['bn/gamma:0'] + + with self.test_session() as sess: + # Test training with placeholder learning phase. + sess.run(tf.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run([outputs2] + updates, + feed_dict={training: True}) + + # Verify that the statistics are updated during training. 
+ np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance]) + np_inputs = sess.run(inputs2) + np_mean = np.mean(np_inputs, axis=(0, 1, 2)) + np_std = np.std(np_inputs, axis=(0, 1, 2)) + np_variance = np.square(np_std) + self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) + self.assertAllClose(np_variance, np_moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = sess.run([gamma, beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs2, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) + + def testNoCenter(self): + bn = normalization_layers.BatchNormalization(axis=1, center=False) + inputs = tf.random_uniform((5, 4, 3), seed=1) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 3) + self.assertEqual(len(bn.trainable_variables), 1) + self.assertEqual(len(bn.non_trainable_variables), 2) + + def testNoScale(self): + bn = normalization_layers.BatchNormalization(axis=1, scale=False) + inputs = tf.random_uniform((5, 4, 3), seed=1) + training = tf.placeholder(dtype='bool') + outputs = bn.apply(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 3) + self.assertEqual(len(bn.trainable_variables), 1) + self.assertEqual(len(bn.non_trainable_variables), 2) + + def testRegularizers(self): + reg = lambda x: 0.1 * tf.reduce_sum(x) + bn = normalization_layers.BatchNormalization(axis=1, beta_regularizer=reg) + inputs = tf.random_uniform((5, 4, 3), seed=1) + training = tf.placeholder(dtype='bool') + _ = bn.apply(inputs, training=training) + self.assertEqual(len(bn.losses), 1) + + bn = normalization_layers.BatchNormalization(axis=1, gamma_regularizer=reg) + inputs = tf.random_uniform((5, 4, 3), seed=1) + training = tf.placeholder(dtype='bool') + _ = bn.apply(inputs, training=training) + self.assertEqual(len(bn.losses), 1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py new file mode 100644 index 00000000000000..f6e8ce8a281a50 --- /dev/null +++ b/tensorflow/python/layers/pooling.py @@ -0,0 +1,614 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order +"""Contains the pooling layer classes and their functional aliases. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import variable_scope as vs + +from tensorflow.python.layers import base +from tensorflow.python.layers import conv_utils as utils + + +class _Pooling1D(base._Layer): # pylint: disable=protected-access + """Pooling layer for arbitrary pooling functions, for 1D inputs. + + This class only exists for code reuse. It will never be an exposed API. + + Arguments: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_function, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(_Pooling1D, self).__init__(name=name, **kwargs) + self.pool_function = pool_function + self.pool_size = utils.normalize_tuple(pool_size, 1, 'pool_size') + self.strides = utils.normalize_tuple(strides, 1, 'strides') + self.padding = utils.normalize_padding(padding) + self.data_format = utils.normalize_data_format(data_format) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError('Inputs should have rank 3. ' + 'Received input shape:', str(input_shape)) + + def call(self, inputs): + # There is no TF op for 1D pooling, hence we make the inputs 4D. + if self.data_format == 'channels_last': + inputs = array_ops.expand_dims(inputs, 2) + pool_shape = (1,) + self.pool_size + (1, 1) + strides = (1,) + self.strides + (1, 1) + data_format = 'NHWC' + else: + inputs = array_ops.expand_dims(inputs, 1) + pool_shape = (1, 1) + self.pool_size + (1,) + strides = (1, 1) + self.strides + (1,) + data_format = 'NCHW' + + outputs = self.pool_function( + inputs, + ksize=pool_shape, + strides=strides, + padding=self.padding.upper(), + data_format=data_format) + + if self.data_format == 'channels_last': + return array_ops.squeeze(outputs, 2) + else: + return array_ops.squeeze(outputs, 1) + + +class AveragePooling1D(_Pooling1D): + """Average Pooling layer for 1D inputs. + + Arguments: + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. 
+ strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(AveragePooling1D, self).__init__( + nn.avg_pool, + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs) + + +def average_pooling1d(inputs, pool_size, strides, + padding='valid', data_format='channels_last', + name=None): + """Average Pooling layer for 1D inputs. + + Arguments: + inputs: The tensor over which to pool. Must have rank 3. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + + Returns: + The output tensor, of rank 3. + """ + layer = AveragePooling1D(pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name) + return layer.apply(inputs) + + +class MaxPooling1D(_Pooling1D): + """Max Pooling layer for 1D inputs. + + Arguments: + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(MaxPooling1D, self).__init__( + nn.max_pool, + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs) + + +def max_pooling1d(inputs, pool_size, strides, + padding='valid', data_format='channels_last', + name=None): + """Max Pooling layer for 1D inputs. + + Arguments: + inputs: The tensor over which to pool. Must have rank 3. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. 
+ The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + + Returns: + The output tensor, of rank 3. + """ + layer = MaxPooling1D(pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name) + return layer.apply(inputs) + + +class _Pooling2D(base._Layer): # pylint: disable=protected-access + """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images). + + This class only exists for code reuse. It will never be an exposed API. + + Arguments: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`. + pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_function, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(_Pooling2D, self).__init__(name=name, **kwargs) + self.pool_function = pool_function + self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size') + self.strides = utils.normalize_tuple(strides, 2, 'strides') + self.padding = utils.normalize_padding(padding) + self.data_format = utils.normalize_data_format(data_format) + + def build(self, input_shape): + if len(input_shape) != 4: + raise ValueError('Inputs should have rank 4. ' + 'Received input shape:', str(input_shape)) + + def call(self, inputs): + if self.data_format == 'channels_last': + pool_shape = (1,) + self.pool_size + (1,) + strides = (1,) + self.strides + (1,) + else: + pool_shape = (1, 1) + self.pool_size + strides = (1, 1) + self.strides + return self.pool_function( + inputs, + ksize=pool_shape, + strides=strides, + padding=self.padding.upper(), + data_format=utils.convert_data_format(self.data_format, 4)) + + +class AveragePooling2D(_Pooling2D): + """Average pooling layer for 2D inputs (e.g. images). + + Arguments: + pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, channels, width)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. 
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(AveragePooling2D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, name=name, **kwargs)
+
+
+def average_pooling2d(inputs,
+                      pool_size, strides,
+                      padding='valid', data_format='channels_last',
+                      name=None):
+  """Average pooling layer for 2D inputs (e.g. images).
+
+  Arguments:
+    inputs: The tensor over which to pool. Must have rank 4.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string. The ordering of the dimensions in the inputs.
+      `channels_last` (default) and `channels_first` are supported.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+
+  Returns:
+    Output tensor.
+  """
+  layer = AveragePooling2D(pool_size=pool_size, strides=strides,
+                           padding=padding, data_format=data_format,
+                           name=name)
+  return layer.apply(inputs)
+
+
+class MaxPooling2D(_Pooling2D):
+  """Max pooling layer for 2D inputs (e.g. images).
+
+  Arguments:
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string. The ordering of the dimensions in the inputs.
+      `channels_last` (default) and `channels_first` are supported.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(MaxPooling2D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, name=name, **kwargs)
+
+
+def max_pooling2d(inputs,
+                  pool_size, strides,
+                  padding='valid', data_format='channels_last',
+                  name=None):
+  """Max pooling layer for 2D inputs (e.g. images).
+
+  Arguments:
+    inputs: The tensor over which to pool. Must have rank 4.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
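The 2D layers follow the same pattern. A rough sketch of the functional interface, with shapes chosen to match the new tests and the 'valid'/'same' padding rules described above (illustrative only):

```python
import tensorflow as tf

from tensorflow.python.layers import pooling as pooling_layers

images = tf.random_uniform((5, 7, 9, 4))  # (batch, height, width, channels)

# 'valid' drops the incomplete window at each edge: 7x9 -> 3x4.
valid_out = pooling_layers.max_pooling2d(images, pool_size=[2, 2], strides=2)
print(valid_out.get_shape().as_list())  # [5, 3, 4, 4]

# 'same' pads so the output is ceil(input / stride): 7x9 -> 4x5.
same_out = pooling_layers.average_pooling2d(
    images, pool_size=[2, 2], strides=2, padding='same')
print(same_out.get_shape().as_list())  # [5, 4, 5, 4]
```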
+ data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + """ + layer = MaxPooling2D(pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format, + name=name) + return layer.apply(inputs) + + +class _Pooling3D(base._Layer): # pylint: disable=protected-access + """Pooling layer for arbitrary pooling functions, for 3D inputs. + + This class only exists for code reuse. It will never be an exposed API. + + Arguments: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`. + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` + while `channels_first` corresponds to + inputs with shape `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_function, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(_Pooling3D, self).__init__(name=name, **kwargs) + self.pool_function = pool_function + self.pool_size = utils.normalize_tuple(pool_size, 3, 'pool_size') + self.strides = utils.normalize_tuple(strides, 3, 'strides') + self.padding = utils.normalize_padding(padding) + self.data_format = utils.normalize_data_format(data_format) + + def build(self, input_shape): + if len(input_shape) != 5: + raise ValueError('Inputs should have rank 5. ' + 'Received input shape:', str(input_shape)) + + def call(self, inputs): + pool_shape = (1,) + self.pool_size + (1,) + strides = (1,) + self.strides + (1,) + + if self.data_format == 'channels_first': + # TF does not support channels first with 3D pooling operations, + # so we must handle this case manually. + inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1)) + + outputs = self.pool_function( + inputs, + ksize=pool_shape, + strides=strides, + padding=self.padding.upper()) + + if self.data_format == 'channels_first': + outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3)) + return outputs + + +class AveragePooling3D(_Pooling3D): + """Average pooling layer for 3D inputs (e.g. volumes). + + Arguments: + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. 
The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(AveragePooling3D, self).__init__( + nn.avg_pool3d, + pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format, name=name, **kwargs) + + +def average_pooling3d(inputs, + pool_size, strides, + padding='valid', data_format='channels_last', + name=None): + """Average pooling layer for 3D inputs (e.g. volumes). + + Arguments: + inputs: The tensor over which to pool. Must have rank 5. + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + """ + layer = AveragePooling3D(pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format, + name=name) + return layer.apply(inputs) + + +class MaxPooling3D(_Pooling3D): + """Max pooling layer for 3D inputs (e.g. volumes). + + Arguments: + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + """ + + def __init__(self, pool_size, strides, + padding='valid', data_format='channels_last', + name=None, **kwargs): + super(MaxPooling3D, self).__init__( + nn.max_pool3d, + pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format, name=name, **kwargs) + + +def max_pooling3d(inputs, + pool_size, strides, + padding='valid', data_format='channels_last', + name=None): + """Max pooling layer for 3D inputs (e.g. volumes). + + Arguments: + inputs: The tensor over which to pool. Must have rank 5. 
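The channels_first handling in `_Pooling3D.call` above can be illustrated directly against `tf.nn.max_pool3d`, which only accepts channels_last input; the sketch below mirrors the same transpose-in/transpose-out permutations (shapes are illustrative):

```python
import tensorflow as tf

# A channels_first volume: (batch, channels, depth, height, width).
volumes = tf.random_uniform((5, 4, 6, 7, 9))

# tf.nn.max_pool3d expects channels_last, so transpose to
# (batch, depth, height, width, channels), pool, then transpose back,
# using the same permutations as _Pooling3D.call.
ndhwc = tf.transpose(volumes, (0, 2, 3, 4, 1))
pooled = tf.nn.max_pool3d(ndhwc, ksize=(1, 2, 2, 2, 1),
                          strides=(1, 2, 2, 2, 1), padding='VALID')
ncdhw = tf.transpose(pooled, (0, 4, 1, 2, 3))
print(ncdhw.get_shape().as_list())  # [5, 4, 3, 3, 4]
```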
+ pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + """ + layer = MaxPooling3D(pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format, + name=name) + return layer.apply(inputs) + +# Aliases + +AvgPool2D = AveragePooling2D +MaxPool2D = MaxPooling2D +max_pool2d = max_pooling2d +avg_pool2d = average_pooling2d diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py new file mode 100644 index 00000000000000..c9af80926d2da4 --- /dev/null +++ b/tensorflow/python/layers/pooling_test.py @@ -0,0 +1,162 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tf.layers.pooling.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.python.layers import pooling as pooling_layers + + +class PoolingTest(tf.test.TestCase): + + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'data_format'): + pooling_layers.max_pooling2d(images, 3, strides=2, data_format='invalid') + + def testInvalidStrides(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'strides'): + pooling_layers.max_pooling2d(images, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegexp( + ValueError, 'strides'): + pooling_layers.max_pooling2d(images, 3, strides=None) + + def testInvalidPoolSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegexp( + ValueError, 'pool_size'): + pooling_layers.max_pooling2d(images, (1, 2, 3), strides=2) + + with self.assertRaisesRegexp( + ValueError, 'pool_size'): + pooling_layers.max_pooling2d(images, None, strides=2) + + def testCreateMaxPooling2D(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = pooling_layers.MaxPooling2D([2, 2], strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4, 4]) + + def testCreateAveragePooling2D(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = pooling_layers.AveragePooling2D([2, 2], strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4, 4]) + + def testCreateMaxPooling1D(self): + width = 7 + images = tf.random_uniform((5, width, 4)) + layer = pooling_layers.MaxPooling1D(2, strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4]) + + def testCreateAveragePooling1D(self): + width = 7 + images = tf.random_uniform((5, width, 4)) + layer = pooling_layers.AveragePooling1D(2, strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4]) + + def testCreateMaxPooling1DChannelsFirst(self): + width = 7 + images = tf.random_uniform((5, width, 4)) + layer = pooling_layers.MaxPooling1D(2, strides=2, + data_format='channels_first') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4]) + + def testCreateMaxPooling3D(self): + depth, height, width = 6, 7, 9 + images = tf.random_uniform((5, depth, height, width, 4)) + layer = pooling_layers.MaxPooling3D([2, 2, 2], strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 3, 4, 4]) + + def testCreateAveragePooling3D(self): + depth, height, width = 6, 7, 9 + images = tf.random_uniform((5, depth, height, width, 4)) + layer = pooling_layers.AveragePooling3D([2, 2, 2], strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 3, 4, 4]) + + def testmaxPooling3DChannelsFirst(self): + depth, height, width = 6, 7, 9 + images = tf.random_uniform((5, 4, depth, height, width)) + layer = pooling_layers.AveragePooling3D([2, 2, 2], strides=2, + data_format='channels_first') + output = layer.apply(images) + 
self.assertListEqual(output.get_shape().as_list(), + [5, 4, 3, 3, 4]) + + def testCreateMaxPooling2DIntegerPoolSize(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4)) + layer = pooling_layers.MaxPooling2D(2, strides=2) + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, 3, 4, 4]) + + def testMaxPooling2DPaddingSame(self): + height, width = 7, 9 + images = tf.random_uniform((5, height, width, 4), seed=1) + layer = pooling_layers.MaxPooling2D(images.get_shape()[1:3], + strides=2, padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), [5, 4, 5, 4]) + + def testCreatePooling2DWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random_uniform((5, height, width, 3), seed=1) + layer = pooling_layers.MaxPooling2D([2, 2], strides=(2, 2), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 3]) + + # Test strides integer + layer = pooling_layers.MaxPooling2D([2, 2], strides=2, padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width / 2, 3]) + + # Test unequal strides + layer = pooling_layers.MaxPooling2D([2, 2], strides=(2, 1), padding='same') + output = layer.apply(images) + self.assertListEqual(output.get_shape().as_list(), + [5, height / 2, width, 3]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 17853c88c3d712..7efdb281aed6c2 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -462,11 +462,13 @@ def _PadGrad(op, grad): @ops.RegisterGradient("ReverseSequence") def _ReverseSequenceGrad(op, grad): seq_lengths = op.inputs[1] - return [array_ops.reverse_sequence(grad, - batch_dim=op.get_attr("batch_dim"), - seq_dim=op.get_attr("seq_dim"), - seq_lengths=seq_lengths), - None] + return [ + array_ops.reverse_sequence( + grad, + batch_axis=op.get_attr("batch_dim"), + seq_axis=op.get_attr("seq_dim"), + seq_lengths=seq_lengths), None + ] @ops.RegisterGradient("Reverse") diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 1d7827bb98a029..1b5b11354e3f33 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -116,6 +116,7 @@ # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_array_ops import * +from tensorflow.python.util import deprecation from tensorflow.python.util.deprecation import deprecated # pylint: enable=wildcard-import @@ -248,7 +249,7 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32): with ops.name_scope(name, "Shape", [input]) as name: if isinstance( input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): - return gen_math_ops.cast(input.shape, out_type) + return gen_math_ops.cast(input.dense_shape, out_type) else: input_tensor = ops.convert_to_tensor(input) input_shape = input_tensor.get_shape() @@ -301,7 +302,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32): if isinstance( input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): return gen_math_ops._prod( - gen_math_ops.cast(input.shape, out_type), 0, name=name) + gen_math_ops.cast(input.dense_shape, out_type), 0, name=name) else: input_tensor = ops.convert_to_tensor(input) input_shape = input_tensor.get_shape() @@ -357,7 +358,7 @@ 
def rank_internal(input, name=None, optimize=True): with ops.name_scope(name, "Rank", [input]) as name: if isinstance( input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): - return gen_array_ops.size(input.shape, name=name) + return gen_array_ops.size(input.dense_shape, name=name) else: input_tensor = ops.convert_to_tensor(input) input_shape = input_tensor.get_shape() @@ -534,7 +535,7 @@ def slice(input_, begin, size, name=None): def strided_slice(input_, begin, end, - strides, + strides=None, begin_mask=0, end_mask=0, ellipsis_mask=0, @@ -623,6 +624,10 @@ def strided_slice(input_, Returns: A `Tensor` the same type as `input`. """ + + if strides is None: + strides = ones_like(begin) + op = gen_array_ops.strided_slice( input=input_, begin=begin, @@ -1052,6 +1057,10 @@ def concat_v2(values, axis, name="concat_v2"): name=name) +@deprecated( + "2016-12-13", + "This op will be removed after the deprecation date. " + "Please switch to tf.concat_v2().") def concat(concat_dim, values, name="concat"): """Concatenates tensors along one dimension. @@ -1106,24 +1115,7 @@ def concat(concat_dim, values, name="concat"): Returns: A `Tensor` resulting from concatenation of the input tensors. """ - # TODO(annarev): switch to call concat_v2 instead. - if not isinstance(values, (list, tuple)): - values = [values] - # TODO(mrry): Change to return values? - if len(values) == 1: # Degenerate case of one tensor. - # Make a throwaway call to convert_to_tensor to make sure - # that axis is of the correct type, and make sure that - # the returned tensor is a scalar. - # TODO(keveman): Implement a standalone type and shape checker. - with ops.name_scope(name) as scope: - ops.convert_to_tensor(concat_dim, - name="concat_dim", - dtype=dtypes.int32).get_shape( - ).assert_is_compatible_with(tensor_shape.scalar()) - return identity(values[0], name=scope) - return gen_array_ops._concat(concat_dim=concat_dim, - values=values, - name=name) + return concat_v2(values, concat_dim, name) def boolean_mask(tensor, mask, name="boolean_mask"): @@ -1235,11 +1227,17 @@ def sparse_mask(a, mask_indices, name=None): return ops.IndexedSlices(out_values, out_indices, a.dense_shape) -def split(split_dim, num_split, value, name="split"): - """Splits a tensor into `num_split` tensors along one dimension. +def split(axis=None, + num_or_size_splits=None, + value=None, + name="split", + split_dim=None): + """DEPRECATED: use split_v; split_v rename to split happening soon. - Splits `value` along dimension `split_dim` into `num_split` smaller tensors. - Requires that `num_split` evenly divide `value.shape[split_dim]`. + Splits a tensor into `num_split` tensors along one dimension. + + Splits `value` along dimension `axis` into `num_or_size_splits` smaller + tensors. Requires that `num_or_size_splits` evenly divide `value.shape[axis]`. For example: @@ -1265,29 +1263,35 @@ def split(split_dim, num_split, value, name="split"): ``` Args: - split_dim: A 0-D `int32` `Tensor`. The dimension along which to split. + axis: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. - num_split: A Python integer. The number of ways to split. + num_or_size_splits: A Python integer. The number of ways to split. Has a + different meaning in split_v (see docs). value: The `Tensor` to split. name: A name for the operation (optional). + split_dim: The old (deprecated) name for axis. Returns: `num_split` `Tensor` objects resulting from splitting `value`. 
""" - return gen_array_ops._split(split_dim=split_dim, - num_split=num_split, - value=value, - name=name) + axis = deprecation.deprecated_argument_lookup("axis", axis, "split_dim", + split_dim) + return gen_array_ops._split( + split_dim=axis, num_split=num_or_size_splits, value=value, name=name) -def split_v(value, size_splits, split_dim=0, num=None, name="split_v"): +def split_v(value=None, + num_or_size_splits=None, + axis=0, + num=None, + name="split_v"): """Splits a tensor into sub tensors. - If size_splits is a scalar, `num_split`, then - splits `value` along dimension `split_dim` into `num_split` smaller tensors. + If num_or_size_splits is a scalar, `num_split`, then + splits `value` along dimension `axis` into `num_split` smaller tensors. Requires that `num_split` evenly divide `value.shape[split_dim]`. - If size_splits is a tensor, then + If num_or_size_splits is a tensor, then splits `value` into len(size_splits) pieces each the same size as the input except along dimension split_dim where the size is size_splits[i]. @@ -1307,29 +1311,32 @@ def split_v(value, size_splits, split_dim=0, num=None, name="split_v"): Args: value: The `Tensor` to split. - size_splits: Either an integer indicating the number of splits along + num_or_size_splits: Either an integer indicating the number of splits along split_dim or a 1-D Tensor containing the sizes of each output tensor along split_dim. If an integer then it must evenly divide value.shape[split_dim]; otherwise the sum of sizes along the split dimension must match that of the input. - split_dim: A 0-D `int32` `Tensor`. The dimension along which to split. + axis: A 0-D `int32` `Tensor`. The dimension along which to split. Must be in the range `[0, rank(value))`. Defaults to 0. num: Optional, used to specify the number of outputs when it cannot be inferred from the shape of size_splits. name: A name for the operation (optional). Returns: - `len(size_splits)` `Tensor` objects resulting from splitting `value`. + if `num_or_size_splits` is a scalar returns `num_or_size_splits` `Tensor` + objects; if `num_or_size_splits` is a 1-D Tensor returns + `num_or_size_splits.get_shape[0]` `Tensor` objects resulting from splitting + `value`. Raises: ValueError: If `num` is unspecified and cannot be inferred. """ - if isinstance(size_splits, six.integer_types): + if isinstance(num_or_size_splits, six.integer_types): return gen_array_ops._split( - split_dim=split_dim, num_split=size_splits, value=value, name=name) + split_dim=axis, num_split=num_or_size_splits, value=value, name=name) else: if num is None: - size_splits = ops.convert_to_tensor(size_splits) + size_splits = ops.convert_to_tensor(num_or_size_splits) size_splits_shape = size_splits.get_shape() num = size_splits_shape.dims if num is None: @@ -1337,7 +1344,7 @@ def split_v(value, size_splits, split_dim=0, num=None, name="split_v"): return gen_array_ops._split_v( value=value, size_splits=size_splits, - split_dim=split_dim, + split_dim=axis, num_split=num[0], name=name) @@ -1670,7 +1677,7 @@ def sparse_placeholder(dtype, shape=None, name=None): print(sess.run(y, feed_dict={ x: (indices, values, shape)})) # Will succeed. - sp = tf.SparseTensor(indices=indices, values=values, shape=shape) + sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape) sp_value = sp.eval(session) print(sess.run(y, feed_dict={x: sp_value})) # Will succeed. 
``` @@ -2345,7 +2352,7 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None): ```prettyprint # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] shape(squeeze(t)) ==> [2, 3] - ``` + ``` Or, to remove specific size 1 dimensions: @@ -2426,3 +2433,30 @@ def where(condition, x=None, y=None, name=None): return gen_math_ops._select(condition=condition, t=x, e=y, name=name) else: raise ValueError("x and y must both be non-None or both be None.") + + +# pylint: disable=redefined-builtin +def reverse_sequence(input, + seq_lengths, + seq_axis=None, + batch_axis=None, + name=None, + seq_dim=None, + batch_dim=None): + seq_axis = deprecation.deprecated_argument_lookup("seq_axis", seq_axis, + "seq_dim", seq_dim) + batch_axis = deprecation.deprecated_argument_lookup("batch_axis", batch_axis, + "batch_dim", batch_dim) + return gen_array_ops.reverse_sequence( + input=input, + seq_lengths=seq_lengths, + seq_dim=seq_axis, + batch_dim=batch_axis, + name=name) +# pylint: enable=redefined-builtin + + +reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring( + deprecation.rewrite_argument_docstring( + gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"), + "seq_dim", "seq_axis") diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 6377a96de89638..b50153dcc67e61 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -665,6 +665,7 @@ def assert_type(tensor, tf_type, message=None, name=None): return control_flow_ops.no_op('statically_determined_correct_type') +# pylint: disable=line-too-long def _get_diff_for_monotonic_comparison(x): """Gets the difference x[1:] - x[:-1].""" x = array_ops.reshape(x, [-1]) @@ -677,7 +678,7 @@ def _get_diff_for_monotonic_comparison(x): # With 2 or more elements, return x[1:] - x[:-1] s_len = array_ops.shape(x) - 1 - diff = lambda: array_ops.slice(x, [1], s_len) - array_ops.slice(x, [0], s_len) + diff = lambda: array_ops.strided_slice(x, [1], [1] + s_len)- array_ops.strided_slice(x, [0], s_len) return control_flow_ops.cond(is_shorter_than_two, short_result, diff) diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py new file mode 100644 index 00000000000000..0b9e79c640b220 --- /dev/null +++ b/tensorflow/python/ops/confusion_matrix.py @@ -0,0 +1,163 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Confusion matrix related utilities. 
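A rough sketch of the axis-based argument names introduced in the array_ops hunks above for `split_v` and `reverse_sequence`; values and shapes are illustrative, and the old `split_dim`/`seq_dim`/`batch_dim` keywords continue to resolve through `deprecation.deprecated_argument_lookup`:

```python
import tensorflow as tf

from tensorflow.python.ops import array_ops

value = tf.reshape(tf.range(30), [5, 6])

# New-style keywords: num_or_size_splits and axis (previously split_dim).
# A size list such as [1, 2, 3] would give uneven pieces instead of equal ones.
a, b, c = array_ops.split_v(value, num_or_size_splits=3, axis=1)
print(a.get_shape().as_list())  # [5, 2]

# reverse_sequence now prefers seq_axis/batch_axis over seq_dim/batch_dim.
lengths = tf.constant([6, 6, 6, 6, 6], dtype=tf.int64)
reversed_rows = array_ops.reverse_sequence(
    value, seq_lengths=lengths, seq_axis=1, batch_axis=0)
```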
+ + +@@remove_squeezable_dimensions +@@confusion_matrix +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import sparse_ops + + +def remove_squeezable_dimensions(labels, predictions, name=None): + """Squeeze last dim if ranks of `predictions` and `labels` differ by 1. + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + labels: Label values, a `Tensor` whose dimensions match `predictions`. + predictions: Predicted values, a `Tensor` of arbitrary dimensions. + name: Name of the op. + + Returns: + Tuple of `labels` and `predictions`, possibly with last dim squeezed. + """ + with ops.name_scope(name, 'remove_squeezable_dimensions', + [labels, predictions]): + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions_shape = predictions.get_shape() + predictions_rank = predictions_shape.ndims + labels_shape = labels.get_shape() + labels_rank = labels_shape.ndims + if (labels_rank is not None) and (predictions_rank is not None): + # Use static rank. + rank_diff = predictions_rank - labels_rank + if rank_diff == -1: + labels = array_ops.squeeze(labels, [-1]) + elif rank_diff == 1: + predictions = array_ops.squeeze(predictions, [-1]) + return labels, predictions + + # Use dynamic rank. + rank_diff = array_ops.rank(predictions) - array_ops.rank(labels) + if (predictions_rank is None) or ( + predictions_shape.dims[-1].is_compatible_with(1)): + predictions = control_flow_ops.cond( + math_ops.equal(1, rank_diff), + lambda: array_ops.squeeze(predictions, [-1]), + lambda: predictions) + if (labels_rank is None) or ( + labels_shape.dims[-1].is_compatible_with(1)): + labels = control_flow_ops.cond( + math_ops.equal(-1, rank_diff), + lambda: array_ops.squeeze(labels, [-1]), + lambda: labels) + return labels, predictions + + +def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32, + name=None, weights=None): + """Computes the confusion matrix from predictions and labels. + + Calculate the Confusion Matrix for a pair of prediction and + label 1-D int arrays. + + The matrix rows represent the prediction labels and the columns + represents the real labels. The confusion matrix is always a 2-D array + of shape `[n, n]`, where `n` is the number of valid labels for a given + classification task. Both prediction and labels must be 1-D arrays of + the same shape in order for this function to work. + + If `num_classes` is None, then `num_classes` will be set to the one plus + the maximum value in either predictions or labels. + Class labels are expected to start at 0. E.g., if `num_classes` was + three, then the possible labels would be `[0, 1, 2]`. + + If `weights` is not `None`, then each prediction contributes its + corresponding weight to the total value of the confusion matrix cell. + + For example: + + ```python + tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> + [[0 0 0 0 0] + [0 0 1 0 0] + [0 0 1 0 0] + [0 0 0 0 0] + [0 0 0 0 1]] + ``` + + Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, + resulting in a 5x5 confusion matrix. 
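A small sketch of how `remove_squeezable_dimensions` interacts with `confusion_matrix` when labels carry a trailing size-1 dimension, assuming the new module is importable from its source path like the other files added in this change:

```python
import tensorflow as tf

from tensorflow.python.ops import confusion_matrix as confusion_matrix_ops

labels = tf.constant([[1], [2], [4]])   # rank 2, trailing size-1 dimension
predictions = tf.constant([2, 2, 4])    # rank 1

# The trailing dimension of `labels` is squeezed so both tensors end up rank 1.
labels, predictions = confusion_matrix_ops.remove_squeezable_dimensions(
    labels, predictions)

cm = confusion_matrix_ops.confusion_matrix(labels, predictions)
with tf.Session() as sess:
  print(sess.run(cm))  # the 5x5 matrix from the docstring example above
```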
+ + Args: + labels: A 1-D representing the real labels for the classification task. + predictions: A 1-D array representing the predictions for a given + classification. + num_classes: The possible number of labels the classification task can + have. If this value is not provided, it will be calculated + using both predictions and labels array. + dtype: Data type of the confusion matrix. + name: Scope name. + weights: An optional `Tensor` whose shape matches `predictions`. + + Returns: + A k X k matrix representing the confusion matrix, where k is the number of + possible labels in the classification task. + + Raises: + ValueError: If both predictions and labels are not 1-D vectors and have + mismatched shapes, or if `weights` is not `None` and its shape doesn't + match `predictions`. + """ + with ops.name_scope(name, 'confusion_matrix', + [predictions, labels, num_classes]) as name: + labels, predictions = remove_squeezable_dimensions( + ops.convert_to_tensor(labels, name='labels'), + ops.convert_to_tensor( + predictions, name='predictions')) + predictions = math_ops.cast(predictions, dtypes.int64) + labels = math_ops.cast(labels, dtypes.int64) + + if num_classes is None: + num_classes = math_ops.maximum(math_ops.reduce_max(predictions), + math_ops.reduce_max(labels)) + 1 + + if weights is not None: + predictions.get_shape().assert_is_compatible_with(weights.get_shape()) + weights = math_ops.cast(weights, dtype) + + shape = array_ops.pack([num_classes, num_classes]) + indices = array_ops.transpose(array_ops.pack([predictions, labels])) + values = (array_ops.ones_like(predictions, dtype) + if weights is None else weights) + cm_sparse = sparse_tensor.SparseTensor( + indices=indices, values=values, shape=math_ops.to_int64(shape)) + zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype) + + return sparse_ops.sparse_add(zero_matrix, cm_sparse) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index ce22ffccbaf4e3..52f1e58ea13ac7 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -176,7 +176,7 @@ def _Identity(data, name=None): dense_shape = array_ops.identity(dense_shape, name="dense_shape") return ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = array_ops.identity(data.shape, name="dense_shape") + dense_shape = array_ops.identity(data.dense_shape, name="dense_shape") return sparse_tensor.SparseTensor(indices, values, dense_shape) @@ -198,7 +198,7 @@ def _NextIteration(data, name=None): dense_shape = next_iteration(dense_shape, name="dense_shape") return ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = next_iteration(data.shape, name="dense_shape") + dense_shape = next_iteration(data.dense_shape, name="dense_shape") return sparse_tensor.SparseTensor(indices, values, dense_shape) @@ -288,7 +288,7 @@ def exit(data, name=None): dense_shape = gen_control_flow_ops._exit(dense_shape, name) return ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = gen_control_flow_ops._exit(data.shape, name) + dense_shape = gen_control_flow_ops._exit(data.dense_shape, name) return sparse_tensor.SparseTensor(indices, values, dense_shape) @@ -333,7 +333,7 @@ def switch(data, pred, dtype=None, name=None): return (ops.IndexedSlices(val_f, ind_f, dense_shape_f), ops.IndexedSlices(val_t, ind_t, dense_shape_t)) else: - dense_shape = data.shape + dense_shape = data.dense_shape dense_shape_f, dense_shape_t = gen_control_flow_ops._switch( data.shape, pred, 
name="dense_shape") return (sparse_tensor.SparseTensor(ind_f, val_f, dense_shape_f), @@ -425,7 +425,7 @@ def merge(inputs, name=None): indices, chosen_index = gen_control_flow_ops._merge( [inp.indices for inp in inputs], name="indices") dense_shape, _ = gen_control_flow_ops._merge( - [inp.shape for inp in inputs], name="dense_shape") + [inp.dense_shape for inp in inputs], name="dense_shape") return (sparse_tensor.SparseTensor(indices, values, dense_shape), chosen_index) else: @@ -456,7 +456,7 @@ def _make_tensor_array(ta, t_or_flow): new_ta = tensor_array_ops.TensorArray( dtype=ta.dtype, handle=ta.handle, flow=t_or_flow, infer_shape=ta._infer_shape) - new_ta._elem_shape = ta._elem_shape + new_ta._element_shape = ta._element_shape # pylint: disable=protected-access return new_ta @@ -550,16 +550,16 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes): if var.dense_shape is not None: var.dense_shape.set_shape(tensor_shape.TensorShape([shape.ndims])) else: - if not _ShapeLessThanOrEqual(inp.shape.get_shape(), shape): + if not _ShapeLessThanOrEqual(inp.dense_shape.get_shape(), shape): raise ValueError( "The shape invariant specified for %s is not compatible with " "the initial shape of the shape tensor of this SparseTensor. " "It enters the loop with shape %s, but the specified shape " "invariant is %s." - % (inp.shape.name, inp.shape.get_shape(), shape)) + % (inp.dense_shape.name, inp.dense_shape.get_shape(), shape)) var.values.set_shape(tensor_shape.TensorShape([None])) var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims])) - var.shape.set_shape(shape) + var.dense_shape.set_shape(shape) def _EnforceShapeInvariant(merge_var, next_var): @@ -613,10 +613,10 @@ def _EnforceShapeInvariant(merge_var, next_var): else: m_values_shape = merge_var.values.get_shape() m_indices_shape = merge_var.indices.get_shape() - m_shape_shape = merge_var.shape.get_shape() + m_shape_shape = merge_var.dense_shape.get_shape() n_values_shape = next_var.values.get_shape() n_indices_shape = next_var.indices.get_shape() - n_shape_shape = next_var.shape.get_shape() + n_shape_shape = next_var.dense_shape.get_shape() if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)): @@ -654,7 +654,7 @@ def _AddNextAndBackEdge(m, v): # pylint: disable=protected-access m.values.op._update_input(1, v.values) m.indices.op._update_input(1, v.indices) - m.shape.op._update_input(1, v.shape) + m.dense_shape.op._update_input(1, v.dense_shape) # pylint: enable=protected-access else: raise TypeError("Type %s not supported" % type(m)) @@ -971,7 +971,7 @@ def GetRealValue(self, value): Returns: The same tensor obtained from the saved history. 
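The `.shape` to `.dense_shape` migration in the hunks above is a property and keyword rename on `SparseTensor`; a one-line orientation sketch with illustrative values:

```python
import tensorflow as tf

# The constructor keyword and the property are both `dense_shape` now; the
# hunks above move internal code off the older `.shape` accessor.
sp = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2],
                     dense_shape=[3, 4])
print(sp.dense_shape)  # an int64 Tensor whose value is [3, 4]
```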
""" - assert value.op.type != "Variable" + assert value.op.type not in ["Variable", "VariableV2"] real_value = self._history_map.get(value.name) if real_value is None: cur_value = value @@ -1698,7 +1698,7 @@ def BuildCondBranch(self, fn): dense_shape = self._ProcessOutputTensor(dense_shape) real_v = ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = self._ProcessOutputTensor(v.shape) + dense_shape = self._ProcessOutputTensor(v.dense_shape) real_v = sparse_tensor.SparseTensor(indices, values, dense_shape) else: real_v = self._ProcessOutputTensor(v) @@ -2363,7 +2363,7 @@ def _InitializeValues(self, values): if isinstance(x, ops.IndexedSlices): dense_shape = x.dense_shape elif isinstance(x, sparse_tensor.SparseTensor): - dense_shape = x.shape + dense_shape = x.dense_shape else: raise TypeError("Type %s not supported" % type(x)) if dense_shape is not None: @@ -2496,7 +2496,7 @@ def _FixControlInputsAndContext(self, enters): if not isinstance(e, (ops.IndexedSlices, sparse_tensor.SparseTensor)): raise TypeError("Type %s not supported" % type(e)) xs = [e.values, e.indices] - shape = e.dense_shape if isinstance(e, ops.IndexedSlices) else e.shape + shape = e.dense_shape if shape is not None: xs.append(shape) for x in xs: @@ -2548,7 +2548,7 @@ def while_loop(cond, body, loop_vars, shape_invariants=None, TensorShape([r]) where r is the rank of the dense tensor represented by the sparse tensor. It means the shapes of the three tensors of the SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here - is the shape of the SparseTensor.shape property. It must be the shape of + is the shape of the SparseTensor.dense_shape property. It must be the shape of a vector. b) If a loop variable is an IndexedSlices, the shape invariant must be diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index 0e14b4f5a17db9..2a050498573c6c 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -28,7 +28,7 @@ # pylint: disable=protected-access, invalid-name -def ctc_loss(inputs, labels, sequence_length, +def ctc_loss(labels, inputs, sequence_length, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True): """Computes the CTC (Connectionist Temporal Classification) Loss. @@ -97,17 +97,17 @@ def ctc_loss(inputs, labels, sequence_length, Untested. Very likely will not learn to output repeated classes. Args: + labels: An `int32` `SparseTensor`. + `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores + the id for (batch b, time t). + `labels.values[i]` must take on values in `[0, num_labels)`. + See `core/ops/ctc_ops.cc` for more details. inputs: 3-D `float` `Tensor`. If time_major == False, this will be a `Tensor` shaped: `[batch_size x max_time x num_classes]`. If time_major == True (default), this will be a `Tensor` shaped: `[max_time x batch_size x num_classes]`. The logits. - labels: An `int32` `SparseTensor`. - `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores - the id for (batch b, time t). - `labels.values[i]` must take on values in `[0, num_labels)`. - See `core/ops/ctc_ops.cc` for more details. sequence_length: 1-D `int32` vector, size `[batch_size]`. The sequence lengths. preprocess_collapse_repeated: Boolean. Default: False. @@ -130,7 +130,7 @@ def ctc_loss(inputs, labels, sequence_length, # The second, third, etc output tensors contain the gradients. We use it in # _CTCLossGrad() below. 
if not isinstance(labels, sparse_tensor.SparseTensor): - raise TypeError("Expected labels to be a SparseTensor") + raise TypeError("Expected labels (first argument) to be a SparseTensor") # For internal calculations, we transpose to [time, batch, num_classes] if not time_major: diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index 1d9e3aa71e6f76..c6068a05c6aa79 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -290,8 +290,8 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights, sp_weights.values.get_shape()) sp_ids.indices.get_shape().assert_is_compatible_with( sp_weights.indices.get_shape()) - sp_ids.shape.get_shape().assert_is_compatible_with( - sp_weights.shape.get_shape()) + sp_ids.dense_shape.get_shape().assert_is_compatible_with( + sp_weights.dense_shape.get_shape()) # TODO(yleon): Add enhanced node assertions to verify that sp_ids and # sp_weights have equal indices and shapes. diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 524d764b3500fb..3f50617bf380a3 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -239,13 +239,14 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, expressible as TensorFlow ops, use ```python - result = SparseTensor(input.indices, fn(input.values), input.shape) + result = SparseTensor(input.indices, fn(input.values), input.dense_shape) ``` If, however, the function is not expressible as a TensorFlow op, then use ```python - result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) + result = SparseTensor( + input.indices, map_fn(fn, input.values), input.dense_shape) ``` instead. 
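The `map_fn` docstring above prescribes rebuilding a `SparseTensor` around transformed values rather than mapping over the sparse tensor itself; a minimal sketch of that pattern with illustrative values:

```python
import tensorflow as tf

st = tf.SparseTensor(indices=[[0, 0], [1, 1]], values=[1.0, 4.0],
                     dense_shape=[2, 2])

# Elementwise op expressible in TensorFlow: rebuild around fn(values).
doubled = tf.SparseTensor(st.indices, st.values * 2.0, st.dense_shape)

# Arbitrary per-element function: map over the values, then rebuild.
squared = tf.SparseTensor(st.indices,
                          tf.map_fn(lambda x: x * x, st.values),
                          st.dense_shape)
```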
@@ -304,8 +305,9 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, if isinstance(elems, sparse_tensor.SparseTensor): raise TypeError( "To perform a map on the values of a sparse tensor use either " - " SparseTensor(input.indices, fn(input.values), input.shape) or " - " SparseTensor(input.indices, map_fn(fn, input.values), input.shape)") + " SparseTensor(input.indices, fn(input.values), input.dense_shape) or " + " SparseTensor(input.indices, map_fn(fn, input.values), " + "input.dense_shape)") input_is_sequence = nest.is_sequence(elems) input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt index ab30c8cf199c84..a4aca2b83882cf 100644 --- a/tensorflow/python/ops/hidden_ops.txt +++ b/tensorflow/python/ops/hidden_ops.txt @@ -259,6 +259,7 @@ SdcaShrinkL1 # state_ops Variable +VariableV2 TemporaryVariable DestroyTemporaryVariable diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index ce18da09d754e6..a21ffc4f813087 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -371,7 +371,7 @@ def _benchmarkAdjustHue(self, device, cpu_count): delta = tf.constant(0.1, dtype=tf.float32) outputs = image_ops.adjust_hue(inputs, delta) run_op = tf.group(outputs) - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) for i in xrange(warmup_rounds + benchmark_rounds): if i == warmup_rounds: start = time.time() @@ -1132,13 +1132,13 @@ def _testSampleDistortedBoundingBox(self, image, bounding_box, bounding_box_tf = constant_op.constant(bounding_box_np, dtype=dtypes.float32, shape=bounding_box_np.shape) - begin, end, _ = image_ops.sample_distorted_bounding_box( + begin, size, _ = image_ops.sample_distorted_bounding_box( image_size=image_size_tf, bounding_boxes=bounding_box_tf, min_object_covered=min_object_covered, aspect_ratio_range=aspect_ratio_range, area_range=area_range) - y = array_ops.slice(image_tf, begin, end) + y = array_ops.strided_slice(image_tf, begin, begin + size) for _ in xrange(num_iter): y_tf = y.eval() diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py index 99f992ff5ff88d..e7af6bfe2d9be1 100644 --- a/tensorflow/python/ops/io_ops.py +++ b/tensorflow/python/ops/io_ops.py @@ -131,12 +131,16 @@ (where you increase *N* until it can keep the queue full). Use [`batch_join`](#batch_join) or [`shuffle_batch_join`](#shuffle_batch_join) if you have *N* different subgraphs producing examples to batch and you -want them run by *N* threads. +want them run by *N* threads. Use `maybe_*` to enqueue conditionally. @@batch +@@maybe_batch @@batch_join +@@maybe_batch_join @@shuffle_batch +@@maybe_shuffle_batch @@shuffle_batch_join +@@maybe_shuffle_batch_join """ from __future__ import absolute_import diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 4742b0687e615f..4901057836f300 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -76,8 +76,14 @@ def _Collect(val, collections, default_collections): "This means that TensorFlow will automatically de-duplicate summary " "names based on their scope.") def histogram_summary(tag, values, collections=None, name=None): + # pylint: disable=line-too-long """Outputs a `Summary` protocol buffer with a histogram. + This ops is deprecated. Please switch to tf.summary.histogram. 
+ + For an explanation of why this op was deprecated, and information on how to + migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py) + The generated [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) has one summary value containing a histogram for `values`. @@ -110,8 +116,12 @@ def histogram_summary(tag, values, collections=None, name=None): "names based on the scope they are created in. Also, the max_images " "argument was renamed to max_outputs.") def image_summary(tag, tensor, max_images=3, collections=None, name=None): + # pylint: disable=line-too-long """Outputs a `Summary` protocol buffer with images. + For an explanation of why this op was deprecated, and information on how to + migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py) + The summary has up to `max_images` summary values containing images. The images are built from `tensor` which must be 4-D with shape `[batch_size, height, width, channels]` and where `channels` can be: @@ -171,8 +181,13 @@ def audio_summary(tag, max_outputs=3, collections=None, name=None): + # pylint: disable=line-too-long """Outputs a `Summary` protocol buffer with audio. + This op is deprecated. Please switch to tf.summary.audio. + For an explanation of why this op was deprecated, and information on how to + migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py) + The summary has up to `max_outputs` summary values containing audio. The audio is built from `tensor` which must be 3-D with shape `[batch_size, frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are @@ -219,6 +234,9 @@ def merge_summary(inputs, collections=None, name=None): # pylint: disable=line-too-long """Merges summaries. + This op is deprecated. Please switch to tf.summary.merge, which has identical + behavior. + This op creates a [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) protocol buffer that contains the union of all the values in the input @@ -248,6 +266,9 @@ def merge_summary(inputs, collections=None, name=None): def merge_all_summaries(key=ops.GraphKeys.SUMMARIES): """Merges all summaries collected in the default graph. + This op is deprecated. Please switch to tf.summary.merge_all, which has + identical behavior. + Args: key: `GraphKey` used to collect the summaries. Defaults to `GraphKeys.SUMMARIES`. @@ -296,8 +317,13 @@ def get_summary_op(): "tensor or list of tags to a scalar summary op is no longer " "supported.") def scalar_summary(tags, values, collections=None, name=None): + # pylint: disable=line-too-long """Outputs a `Summary` protocol buffer with scalar values. + This ops is deprecated. Please switch to tf.summary.scalar. + For an explanation of why this op was deprecated, and information on how to + migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py) + The input `tags` and `values` must have the same shape. The generated summary has a summary value for each tag-value pair in `tags` and `values`. 
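The deprecation notes added above all point at the `tf.summary` module; a short migration sketch, assuming the usual `tf.summary.scalar` / `tf.summary.histogram` / `tf.summary.merge_all` replacements:

```python
import tensorflow as tf

loss = tf.constant(0.25)
weights = tf.random_normal([10, 10])

# Deprecated forms (still functional, but emit warnings):
#   tf.scalar_summary('loss', loss)
#   tf.histogram_summary('weights', weights)
#   summary_op = tf.merge_all_summaries()
# tf.summary replacements:
tf.summary.scalar('loss', loss)
tf.summary.histogram('weights', weights)
summary_op = tf.summary.merge_all()
```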
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD new file mode 100644 index 00000000000000..4a2edd99b2b1a6 --- /dev/null +++ b/tensorflow/python/ops/losses/BUILD @@ -0,0 +1,39 @@ +package( + default_visibility = ["//tensorflow:internal"], + features = [ + "-layering_check", + "-parse_headers", + ], +) + +licenses(["notice"]) # Apache 2.0 + +py_library( + name = "losses", + srcs = [ + "__init__.py", + "losses.py", + "util.py", + ], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", + "//tensorflow/python:nn_ops", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/python/ops/losses/__init__.py b/tensorflow/python/ops/losses/__init__.py new file mode 100644 index 00000000000000..3b0d0d8e5a5f28 --- /dev/null +++ b/tensorflow/python/ops/losses/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Loss functions and helpers to manipulate them. +""" + +# pylint: disable=wildcard-import +from tensorflow.python.ops.losses.losses import * +from tensorflow.python.ops.losses.util import * diff --git a/tensorflow/python/ops/losses/losses.py b/tensorflow/python/ops/losses/losses.py new file mode 100644 index 00000000000000..d1bfbb62364721 --- /dev/null +++ b/tensorflow/python/ops/losses/losses.py @@ -0,0 +1,588 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Loss operations for use in neural networks. + +Note: All the losses are added to the `GraphKeys.LOSSES` collection by default. 
+ +@@absolute_difference +@@compute_weighted_loss +@@cosine_distance +@@hinge_loss +@@log_loss +@@mean_pairwise_squared_error +@@mean_squared_error +@@sigmoid_cross_entropy +@@softmax_cross_entropy +@@sparse_softmax_cross_entropy + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.losses import util + + +def _scale_losses(losses, weights): + """Computes the scaled loss. + + Args: + losses: A `Tensor` of size [batch_size, d1, ... dN]. + weights: A `Tensor` of size [1], [batch_size] or [batch_size, d1, ... dN]. + The `losses` are reduced (tf.reduce_sum) until its dimension matches + that of `weights` at which point the reduced `losses` are element-wise + multiplied by `weights` and a final reduce_sum is computed on the result. + Conceptually, this operation is equivalent to broadcasting (tiling) + `weights` to be the same size as `losses`, performing an element-wise + multiplication, and summing the result. + + Returns: + A scalar tf.float32 `Tensor` whose value represents the sum of the scaled + `losses`. + """ + # First, compute the sum of the losses over all elements: + start_index = max(0, weights.get_shape().ndims) + reduction_indices = list(range(start_index, losses.get_shape().ndims)) + reduced_losses = math_ops.reduce_sum(losses, + reduction_indices=reduction_indices) + reduced_losses = math_ops.mul(reduced_losses, weights) + return math_ops.reduce_sum(reduced_losses) + + +def _safe_div(numerator, denominator, name="value"): + """Computes a safe divide which returns 0 if the denominator is zero. + + Note that the function contains an additional conditional check that is + necessary for avoiding situations where the loss is zero causing NaNs to + creep into the gradient computation. + + Args: + numerator: An arbitrary `Tensor`. + denominator: A `Tensor` whose shape matches `numerator` and whose values are + assumed to be non-negative. + name: An optional name for the returned op. + + Returns: + The element-wise value of the numerator divided by the denominator. + """ + return array_ops.where( + math_ops.greater(denominator, 0), + math_ops.div(numerator, array_ops.where( + math_ops.equal(denominator, 0), + array_ops.ones_like(denominator), denominator)), + array_ops.zeros_like(numerator), + name=name) + + +def _safe_mean(losses, num_present): + """Computes a safe mean of the losses. + + Args: + losses: A tensor whose elements contain individual loss measurements. + num_present: The number of measurable losses in the tensor. + + Returns: + A scalar representing the mean of the losses. If `num_present` is zero, + then zero is returned. + """ + total_loss = math_ops.reduce_sum(losses) + return _safe_div(total_loss, num_present) + + +def _num_present(losses, weights, per_batch=False): + """Computes the number of elements in the loss function induced by `weights`. + + A given weights tensor induces different numbers of usable elements in the + `losses` tensor. The `weights` tensor is broadcast across `losses` for all + possible dimensions. For example, if `losses` is a tensor of dimension + [4, 5, 6, 3] and `weights` is a tensor of size [4, 5], then `weights` is, in + effect, tiled to match the size of `losses`. 
Following this effective tile, + the total number of present elements is the number of non-zero weights. + + Args: + losses: A tensor of size [batch_size, d1, ... dN]. + weights: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. + per_batch: Whether to return the number of elements per batch or as a sum + total. + + Returns: + The number of present (non-zero) elements in the losses tensor. If + `per_batch` is True, the value is returned as a tensor of size + [batch_size]. Otherwise, a single scalar tensor is returned. + """ + # If weights is a scalar, its easy to compute: + if weights.get_shape().ndims == 0: + batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses), + [0], [1]), []) + num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)), + math_ops.to_float(batch_size)) + num_per_batch = array_ops.where(math_ops.equal(weights, 0), + 0.0, num_per_batch) + num_per_batch = math_ops.mul(array_ops.ones( + array_ops.reshape(batch_size, [1])), num_per_batch) + return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch) + + # First, count the number of nonzero weights: + if weights.get_shape().ndims >= 1: + reduction_indices = list(range(1, weights.get_shape().ndims)) + num_nonzero_per_batch = math_ops.reduce_sum( + math_ops.to_float(math_ops.not_equal(weights, 0)), + reduction_indices=reduction_indices) + + # Next, determine the number of elements that weight would broadcast to: + broadcast_dims = array_ops.slice(array_ops.shape(losses), + [weights.get_shape().ndims], [-1]) + num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims)) + + num_per_batch = math_ops.mul(num_nonzero_per_batch, num_to_broadcast) + return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch) + + +def compute_weighted_loss( + losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES): + """Computes the weighted loss. + + Args: + losses: A tensor of size [batch_size, d1, ... dN]. + weights: A tensor of size [1] or [batch_size, d1, ... dK] where K < N. + scope: the scope for the operations performed in computing the loss. + loss_collection: the loss will be added to these collections. + + Returns: + A scalar `Tensor` that returns the weighted loss. + + Raises: + ValueError: If `weights` is `None` or the shape is not compatible with + `losses`, or if the number of dimensions (rank) of either `losses` or + `weights` is missing. + """ + with ops.name_scope(scope, "weighted_loss", [losses, weights]): + losses = ops.convert_to_tensor(losses) + input_dtype = losses.dtype + losses = math_ops.to_float(losses) + weights = math_ops.to_float(ops.convert_to_tensor(weights)) + + if losses.get_shape().ndims is None: + raise ValueError("losses.get_shape().ndims cannot be None") + weights_shape = weights.get_shape() + if weights_shape.ndims is None: + raise ValueError("weight.get_shape().ndims cannot be None") + + if weights_shape.ndims > 1 and weights_shape.dims[-1].is_compatible_with(1): + weights = array_ops.squeeze(weights, [-1]) + + total_loss = _scale_losses(losses, weights) + num_present = _num_present(losses, weights) + mean_loss = _safe_mean(total_loss, num_present) + # convert the result back to the input type + mean_loss = math_ops.cast(mean_loss, input_dtype) + util.add_loss(mean_loss, loss_collection) + return mean_loss + + +def absolute_difference( + labels, predictions, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds an Absolute Difference loss to the training procedure. 
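Assuming the package is importable under the path this change adds (`tensorflow.python.ops.losses`), a minimal call to `compute_weighted_loss` might look as follows; the tensor values are made up:

import tensorflow as tf
from tensorflow.python.ops.losses import losses

per_element = tf.constant([[0.5, 1.0],
                           [2.0, 0.0]])
weights = tf.constant([1.0, 0.0])          # per-sample weights; second sample masked
loss = losses.compute_weighted_loss(per_element, weights)
# `loss` is a scalar (weighted sum divided by the number of present elements)
# and has also been added to the GraphKeys.LOSSES collection.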
+ + `weights` acts as a coefficient for the loss. If a scalar is provided, then + the loss is simply scaled by the given value. If `weights` is a tensor of + size [batch_size], then the total loss for each sample of the batch is + rescaled by the corresponding element in the `weight` vector. If the shape of + `weight` matches the shape of `predictions`, then the loss of each + measurable element of `predictions` is scaled by the corresponding value of + `weight`. + + Args: + labels: The ground truth output tensor, same dimensions as 'predictions'. + predictions: The predicted outputs. + weights: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which this loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. + """ + with ops.name_scope(scope, "absolute_difference", + [predictions, labels, weights]) as scope: + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions = math_ops.to_float(predictions) + labels = math_ops.to_float(labels) + losses = math_ops.abs(math_ops.sub(predictions, labels)) + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def cosine_distance( + labels, predictions, dim=None, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds a cosine-distance loss to the training procedure. + + Note that the function assumes that `predictions` and `labels` are already + unit-normalized. + + Args: + labels: A `Tensor` whose shape matches 'predictions' + predictions: An arbitrary matrix. + dim: The dimension along which the cosine distance is computed. + weights: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which this loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If `predictions` shape doesn't match `labels` shape, or + `weights` is `None`. + """ + if dim is None: + raise ValueError("`dim` cannot be None.") + with ops.name_scope(scope, "cosine_distance_loss", + [predictions, labels, weights]) as scope: + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + predictions = math_ops.to_float(predictions) + labels = math_ops.to_float(labels) + + radial_diffs = math_ops.mul(predictions, labels) + losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[dim,]) + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def hinge_loss(labels, logits, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds a hinge loss to the training procedure. + + Args: + labels: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. + logits: The logits, a float tensor. + weights: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` of the loss value. 
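A small usage sketch for `absolute_difference` and `cosine_distance` (import path as added by this change; `tf.nn.l2_normalize` is used on the assumption that callers normalize their inputs, as the docstring requires):

import tensorflow as tf
from tensorflow.python.ops.losses import losses

labels = tf.constant([[1.0, 2.0], [3.0, 4.0]])
predictions = tf.constant([[1.5, 2.0], [2.0, 4.0]])

l1_loss = losses.absolute_difference(labels, predictions)
cosine_loss = losses.cosine_distance(tf.nn.l2_normalize(labels, 1),
                                     tf.nn.l2_normalize(predictions, 1),
                                     dim=1)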
+ + Raises: + ValueError: If the shapes of `logits` and `labels` don't match. + """ + with ops.name_scope(scope, "hinge_loss", [logits, labels]) as scope: + logits.get_shape().assert_is_compatible_with(labels.get_shape()) + # We first need to convert binary labels to -1/1 labels (as floats). + labels = math_ops.to_float(labels) + all_ones = array_ops.ones_like(labels) + labels = math_ops.sub(2 * labels, all_ones) + losses = nn_ops.relu(math_ops.sub(all_ones, math_ops.mul(labels, logits))) + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds a Log Loss term to the training procedure. + + `weight` acts as a coefficient for the loss. If a scalar is provided, then the + loss is simply scaled by the given value. If `weight` is a tensor of size + [batch_size], then the total loss for each sample of the batch is rescaled + by the corresponding element in the `weight` vector. If the shape of + `weight` matches the shape of `predictions`, then the loss of each + measurable element of `predictions` is scaled by the corresponding value of + `weight`. + + Args: + labels: The ground truth output tensor, same dimensions as 'predictions'. + predictions: The predicted outputs. + weights: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. + epsilon: A small increment to add to avoid taking a log of zero. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. + """ + with ops.name_scope(scope, "log_loss", + [predictions, labels, weights]) as scope: + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions = math_ops.to_float(predictions) + labels = math_ops.to_float(labels) + losses = -math_ops.mul( + labels, + math_ops.log(predictions + epsilon)) - math_ops.mul( + (1 - labels), math_ops.log(1 - predictions + epsilon)) + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds a pairwise-errors-squared loss to the training procedure. + + Unlike `mean_squared_error`, which is a measure of the differences between + corresponding elements of `predictions` and `labels`, + `mean_pairwise_squared_error` is a measure of the differences between pairs of + corresponding elements of `predictions` and `labels`. + + For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are + three pairs of differences are summed to compute the loss: + loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 + + Note that since the inputs are of size [batch_size, d0, ... dN], the + corresponding pairs are computed within each batch sample but not across + samples within a batch. For example, if `predictions` represents a batch of + 16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs + is drawn from each image, but not across images. + + `weight` acts as a coefficient for the loss. If a scalar is provided, then the + loss is simply scaled by the given value. 
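`hinge_loss` expects 0/1 labels and raw logits, while `log_loss` expects probabilities; a short sketch of both, with illustrative values only:

import tensorflow as tf
from tensorflow.python.ops.losses import losses

binary_labels = tf.constant([[1.0], [0.0], [1.0]])
logits = tf.constant([[0.7], [-0.3], [-1.2]])
hinge = losses.hinge_loss(binary_labels, logits)

probabilities = tf.constant([[0.9], [0.2], [0.4]])
logl = losses.log_loss(binary_labels, probabilities)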
If `weight` is a tensor of size + [batch_size], then the total loss for each sample of the batch is rescaled + by the corresponding element in the `weight` vector. + + Args: + labels: The ground truth output tensor, whose shape must match the shape of + the `predictions` tensor. + predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN] + where N+1 is the total number of dimensions in `predictions`. + weights: Coefficients for the loss a scalar, a tensor of shape [batch_size] + or a tensor whose shape matches `predictions`. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. + """ + with ops.name_scope(scope, "mean_pairwise_squared_error", + [predictions, labels, weights]) as scope: + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions = math_ops.to_float(predictions) + labels = math_ops.to_float(labels) + weights = math_ops.to_float(ops.convert_to_tensor(weights)) + + diffs = math_ops.sub(predictions, labels) + + # Need to verify here since the function doesn't use compute_weighted_loss + if diffs.get_shape().ndims is None: + raise ValueError("diffs.get_shape().ndims cannot be None") + if weights.get_shape().ndims is None: + raise ValueError("weights.get_shape().ndims cannot be None") + + reduction_indices = list(range(1, diffs.get_shape().ndims)) + + sum_squares_diff_per_batch = math_ops.reduce_sum( + math_ops.square(diffs), + reduction_indices=reduction_indices) + num_present_per_batch = _num_present(diffs, weights, per_batch=True) + + term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, + num_present_per_batch) + + sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) + term2 = 2.0 * _safe_div(math_ops.square(sum_diff), + math_ops.square(num_present_per_batch)) + + loss = _scale_losses(term1 - term2, weights) + + mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0, + loss, + array_ops.zeros_like(loss), + name="value") + util.add_loss(mean_loss, loss_collection) + return mean_loss + + +def mean_squared_error(labels, predictions, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Adds a Sum-of-Squares loss to the training procedure. + + `weight` acts as a coefficient for the loss. If a scalar is provided, then the + loss is simply scaled by the given value. If `weight` is a tensor of size + [batch_size], then the total loss for each sample of the batch is rescaled + by the corresponding element in the `weight` vector. If the shape of + `weight` matches the shape of `predictions`, then the loss of each + measurable element of `predictions` is scaled by the corresponding value of + `weight`. + + Args: + labels: The ground truth output tensor, same dimensions as 'predictions'. + predictions: The predicted outputs. + weights: Coefficients for the loss a scalar, a tensor of shape + [batch_size] or a tensor whose shape matches `predictions`. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shape of `predictions` doesn't match that of `labels` or + if the shape of `weight` is invalid. 
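To make the documented three-pair formula concrete, here it is written out for a single sample in plain Python (this illustrates the documented formula only, not the exact scaling performed by the op):

# labels = [a, b, c], predictions = [x, y, z]
a, b, c = 1.0, 2.0, 4.0
x, y, z = 1.0, 3.0, 3.0
pairwise = (((a - b) - (x - y)) ** 2 +
            ((a - c) - (x - z)) ** 2 +
            ((b - c) - (y - z)) ** 2) / 3.0   # -> 2.0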
+ """ + with ops.name_scope(scope, "mean_squared_error", + [predictions, labels, weights]) as scope: + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions = math_ops.to_float(predictions) + labels = math_ops.to_float(labels) + losses = math_ops.square(math_ops.sub(predictions, labels)) + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def sigmoid_cross_entropy( + multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. + + `weight` acts as a coefficient for the loss. If a scalar is provided, + then the loss is simply scaled by the given value. If `weight` is a + tensor of size [`batch_size`], then the loss weights apply to each + corresponding sample. + + If `label_smoothing` is nonzero, smooth the labels towards 1/2: + + new_multiclass_labels = multiclass_labels * (1 - label_smoothing) + + 0.5 * label_smoothing + + Args: + multi_class_labels: [batch_size, num_classes] target labels in (0, 1). + logits: [batch_size, num_classes] logits outputs of the network . + weights: Coefficients for the loss. The tensor must be a scalar, a tensor of + shape [batch_size] or shape [batch_size, num_classes]. + label_smoothing: If greater than 0 then smooth the labels. + scope: The scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shape of `logits` doesn't match that of + `multi_class_labels` or if the shape of `weight` is invalid, or if + `weight` is None. + """ + with ops.name_scope(scope, "sigmoid_cross_entropy_loss", + [logits, multi_class_labels, weights]) as scope: + logits.get_shape().assert_is_compatible_with(multi_class_labels.get_shape()) + + multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype) + + if label_smoothing > 0: + multi_class_labels = (multi_class_labels * (1 - label_smoothing) + + 0.5 * label_smoothing) + + losses = nn.sigmoid_cross_entropy_with_logits(logits, multi_class_labels, + name="xentropy") + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def softmax_cross_entropy( + onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. + + `weight` acts as a coefficient for the loss. If a scalar is provided, + then the loss is simply scaled by the given value. If `weight` is a + tensor of size [`batch_size`], then the loss weights apply to each + corresponding sample. + + If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes: + new_onehot_labels = onehot_labels * (1 - label_smoothing) + + label_smoothing / num_classes + + Args: + onehot_labels: [batch_size, num_classes] target one_hot_encoded labels. + logits: [batch_size, num_classes] logits outputs of the network . + weights: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size]. + label_smoothing: If greater than 0 then smooth the labels. + scope: the scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. 
+ + Raises: + ValueError: If the shape of `logits` doesn't match that of `onehot_labels` + or if the shape of `weight` is invalid or if `weight` is None. + """ + with ops.name_scope(scope, "softmax_cross_entropy_loss", + [logits, onehot_labels, weights]) as scope: + logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape()) + + onehot_labels = math_ops.cast(onehot_labels, logits.dtype) + + if label_smoothing > 0: + num_classes = math_ops.cast( + array_ops.shape(onehot_labels)[1], logits.dtype) + smooth_positives = 1.0 - label_smoothing + smooth_negatives = label_smoothing / num_classes + onehot_labels = onehot_labels * smooth_positives + smooth_negatives + + losses = nn.softmax_cross_entropy_with_logits(logits, onehot_labels, + name="xentropy") + return compute_weighted_loss(losses, weights, scope, loss_collection) + + +def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None, + loss_collection=ops.GraphKeys.LOSSES): + """Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`. + + `weight` acts as a coefficient for the loss. If a scalar is provided, + then the loss is simply scaled by the given value. If `weight` is a + tensor of size [`batch_size`], then the loss weights apply to each + corresponding sample. + + Args: + labels: [batch_size, 1] or [batch_size] target labels of dtype `int32` or + `int64` in the range `[0, num_classes)`. + logits: [batch_size, num_classes] logits outputs of the network . + weights: Coefficients for the loss. The tensor must be a scalar or a tensor + of shape [batch_size] or [batch_size, 1]. + scope: the scope for the operations performed in computing the loss. + loss_collection: collection to which the loss will be added. + + Returns: + A scalar `Tensor` representing the loss value. + + Raises: + ValueError: If the shapes of logits, labels, and weight are incompatible, or + if `weight` is None. + """ + with ops.name_scope(scope, "sparse_softmax_cross_entropy_loss", + [logits, labels, weights]) as scope: + labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]]) + weights = array_ops.squeeze(weights) + + losses = nn.sparse_softmax_cross_entropy_with_logits(logits, labels, + name="xentropy") + return compute_weighted_loss(losses, weights, scope, loss_collection) diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py new file mode 100644 index 00000000000000..aaf324891f3f3d --- /dev/null +++ b/tensorflow/python/ops/losses/util.py @@ -0,0 +1,88 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for manipulating the loss collections. 
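For integer class ids rather than one-hot targets, `sparse_softmax_cross_entropy` applies; a minimal sketch under the same import-path assumption:

import tensorflow as tf
from tensorflow.python.ops.losses import losses

class_ids = tf.constant([2, 0])                    # shape [batch_size], integer labels
logits = tf.constant([[0.1, 2.0, -1.0],
                      [1.5, 0.2, 0.3]])
xent = losses.sparse_softmax_cross_entropy(class_ids, logits)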
+ + +@@add_loss +@@get_losses +@@get_regularization_losses +@@get_total_loss + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops + + +def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): + """Adds a externally defined loss to the collection of losses. + + Args: + loss: A loss `Tensor`. + loss_collection: Optional collection to add the loss to. + """ + if loss_collection: + ops.add_to_collection(loss_collection, loss) + + +def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): + """Gets the list of losses from the loss_collection. + + Args: + scope: an optional scope for filtering the losses to return. + loss_collection: Optional losses collection. + + Returns: + a list of loss tensors. + """ + return ops.get_collection(loss_collection, scope) + + +def get_regularization_losses(scope=None): + """Gets the regularization losses. + + Args: + scope: an optional scope for filtering the losses to return. + + Returns: + A list of loss variables. + """ + return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope) + + +def get_total_loss(add_regularization_losses=True, name="total_loss"): + """Returns a tensor whose value represents the total loss. + + Notice that the function adds the given losses to the regularization losses. + + Args: + add_regularization_losses: A boolean indicating whether or not to use the + regularization losses in the sum. + name: The name of the returned tensor. + + Returns: + A `Tensor` whose value represents the total loss. + + Raises: + ValueError: if `losses` is not iterable. + """ + losses = get_losses() + if add_regularization_losses: + losses += get_regularization_losses() + return math_ops.add_n(losses, name=name) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 21bfe205ef16fc..649f8d34ae92cd 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -95,7 +95,6 @@ @@matrix_transpose @@matmul -@@batch_matmul @@matrix_determinant @@matrix_inverse @@ -251,7 +250,8 @@ def argmax(input, axis=None, name=None, dimension=None): return gen_math_ops.arg_max(input, axis, name) -argmax.__doc__ = gen_math_ops.arg_max.__doc__.replace("dimension", "axis") +argmax.__doc__ = (gen_math_ops.arg_max.__doc__ + .replace("dimensions", "axes").replace("dimension", "axis")) # TODO(aselle:deprecate arg_min) @@ -263,7 +263,8 @@ def argmin(input, axis=None, name=None, dimension=None): return gen_math_ops.arg_min(input, axis, name) -argmin.__doc__ = gen_math_ops.arg_min.__doc__.replace("dimension", "axis") +argmin.__doc__ = (gen_math_ops.arg_min.__doc__ + .replace("dimensions", "axes").replace("dimension", "axis")) # pylint: enable=redefined-builtin @@ -296,10 +297,10 @@ def abs(x, name=None): x_abs = gen_math_ops.complex_abs( x.values, Tout=x.values.dtype.real_dtype, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_abs, shape=x.shape) + indices=x.indices, values=x_abs, dense_shape=x.shape) x_abs = gen_math_ops._abs(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_abs, shape=x.shape) + indices=x.indices, values=x_abs, dense_shape=x.shape) else: x = ops.convert_to_tensor(x, name="x") if x.dtype in (dtypes.complex64, dtypes.complex128): @@ -336,7 +337,7 @@ def neg(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_neg = gen_math_ops.neg(x.values, name=name) return 
sparse_tensor.SparseTensor( - indices=x.indices, values=x_neg, shape=x.shape) + indices=x.indices, values=x_neg, dense_shape=x.shape) else: return gen_math_ops.neg(x, name=name) @@ -360,7 +361,7 @@ def sign(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_sign = gen_math_ops.sign(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_sign, shape=x.shape) + indices=x.indices, values=x_sign, dense_shape=x.shape) else: return gen_math_ops.sign(x, name=name) @@ -382,7 +383,7 @@ def square(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_square = gen_math_ops.square(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_square, shape=x.shape) + indices=x.indices, values=x_square, dense_shape=x.shape) else: return gen_math_ops.square(x, name=name) @@ -404,7 +405,7 @@ def sqrt(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_sqrt = gen_math_ops.sqrt(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_sqrt, shape=x.shape) + indices=x.indices, values=x_sqrt, dense_shape=x.shape) else: return gen_math_ops.sqrt(x, name=name) @@ -424,7 +425,7 @@ def erf(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_erf = gen_math_ops.erf(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_erf, shape=x.shape) + indices=x.indices, values=x_erf, dense_shape=x.shape) else: return gen_math_ops.erf(x, name=name) @@ -821,8 +822,8 @@ def binary_op_wrapper_sparse(sp_x, y): return sparse_tensor.SparseTensor( sp_x.indices, func( - sp_x.indices, sp_x.values, sp_x.shape, y, name=name), - sp_x.shape) + sp_x.indices, sp_x.values, sp_x.dense_shape, y, name=name), + sp_x.dense_shape) def r_binary_op_wrapper(y, x): with ops.name_scope(None, op_name, [x, y]) as name: @@ -1170,8 +1171,8 @@ def _ReductionDims(x, axis, reduction_indices): return constant_op.constant( np.arange(x.get_shape().ndims), dtype=dtypes.int32) if (isinstance(x, sparse_tensor.SparseTensor) and - x.shape.get_shape().is_fully_defined()): - rank = x.shape.get_shape()[0].value # sparse.shape is an 1-D tensor. + x.dense_shape.get_shape().is_fully_defined()): + rank = x.dense_shape.get_shape()[0].value # sparse.dense_shape is 1-D. return constant_op.constant(np.arange(rank), dtype=dtypes.int32) # Otherwise, we rely on Range and Rank to do the right thing at run-time. @@ -1782,8 +1783,6 @@ def matmul(a, sparse_matmul = gen_math_ops._sparse_mat_mul -# TODO(rmlarsen): Remove Python interface to batch_matmul. -batch_matmul = gen_math_ops._batch_mat_mul @ops.RegisterStatistics("MatMul", "flops") @@ -2000,7 +1999,7 @@ def tanh(x, name=None): if isinstance(x, sparse_tensor.SparseTensor): x_tanh = gen_math_ops._tanh(x.values, name=name) return sparse_tensor.SparseTensor( - indices=x.indices, values=x_tanh, shape=x.shape) + indices=x.indices, values=x_tanh, dense_shape=x.shape) else: return gen_math_ops._tanh(x, name=name) @@ -2175,4 +2174,3 @@ def reduced_shape(input_shape, axes): def select(condition, x, y, name=None): return gen_math_ops._select(condition, x, y, name) select.__doc__ = gen_math_ops._select.__doc__ - diff --git a/tensorflow/python/ops/metrics.py b/tensorflow/python/ops/metrics.py new file mode 100644 index 00000000000000..0989cf20e610ac --- /dev/null +++ b/tensorflow/python/ops/metrics.py @@ -0,0 +1,2588 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Evaluation-related metrics. + +@@accuracy +@@auc +@@mean +@@mean_absolute_error +@@mean_cosine_distance +@mean_iou +@@mean_relative_error +@@mean_squared_error +@@mean_tensor +@@percentage_below +@@precision +@@precision_at_thresholds +@@recall +@@recall_at_k +@@recall_at_thresholds +@@root_mean_squared_error +@@sensitivity_at_specificity +@@sparse_average_precision_at_k +@@sparse_precision_at_k +@@specificity_at_sensitivity + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import confusion_matrix +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import sets +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables + + +def _local_variable(initial_value, validate_shape=True, name=None): + """Create variable and add it to `GraphKeys.LOCAL_VARIABLES` collection. + + Args: + initial_value: See variables.Variable.__init__. + validate_shape: See variables.Variable.__init__. + name: See variables.Variable.__init__. + Returns: + New variable. + """ + return variables.Variable( + initial_value, trainable=False, + collections=[ops.GraphKeys.LOCAL_VARIABLES], + validate_shape=validate_shape, name=name) + + +def _remove_squeezable_dimensions(labels, predictions, weights): + """Internal version of _remove_squeezable_dimensions which handles weights. + + Squeezes `predictions` and `labels` if their rank differs by 1. + Squeezes `weights` if its rank is 1 more than the new rank of `predictions` + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + labels: Label values, a `Tensor` whose dimensions match `predictions`. + predictions: Predicted values, a `Tensor` of arbitrary dimensions. + weights: Optional weight `Tensor`. It will be squeezed if its rank is 1 + more than the new rank of `predictions` + + Returns: + Tuple of `predictions`, `labels` and `weights`, possibly with the last + dimension squeezed. 
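In plain TensorFlow terms, `_local_variable` amounts to roughly the following (a sketch; the helper above also threads through `validate_shape` and `name`):

import tensorflow as tf

v = tf.Variable(0.0, trainable=False,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
# Because it only lives in LOCAL_VARIABLES, it is initialized by the
# local-variables initializer rather than with the model's trainable weights.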
+ """ + labels, predictions = confusion_matrix.remove_squeezable_dimensions( + labels, predictions) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + if weights is not None: + weights = ops.convert_to_tensor(weights) + predictions_shape = predictions.get_shape() + predictions_rank = predictions_shape.ndims + weights_shape = weights.get_shape() + weights_rank = weights_shape.ndims + + if (predictions_rank is not None) and (weights_rank is not None): + # Use static rank. + if weights_rank - predictions_rank == 1: + weights = array_ops.squeeze(weights, [-1]) + elif (weights_rank is None) or ( + weights_shape.dims[-1].is_compatible_with(1)): + # Use dynamic rank + weights = control_flow_ops.cond( + math_ops.equal(array_ops.rank(weights), + math_ops.add(array_ops.rank(predictions), 1)), + lambda: array_ops.squeeze(weights, [-1]), + lambda: weights) + return labels, predictions, weights + + +def _create_local(name, shape, collections=None, validate_shape=True, + dtype=dtypes.float32): + """Creates a new local variable. + + Args: + name: The name of the new or existing variable. + shape: Shape of the new or existing variable. + collections: A list of collection names to which the Variable will be added. + validate_shape: Whether to validate the shape of the variable. + dtype: Data type of the variables. + + Returns: + The created variable. + """ + # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES + collections = list(collections or []) + collections += [ops.GraphKeys.LOCAL_VARIABLES] + return variables.Variable( + initial_value=array_ops.zeros(shape, dtype=dtype), + name=name, + trainable=False, + collections=collections, + validate_shape=validate_shape) + + +def _broadcast_weights(weights, values): + """Broadcast `weights` to the same shape as `values`. + + This returns a version of `weights` following the same broadcast rules as + `mul(weights, values)`. When computing a weighted average, use this function + to broadcast `weights` before summing them; e.g., + `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`. + + Args: + weights: `Tensor` whose shape is broadcastable to `values`. + values: `Tensor` of any shape. + + Returns: + `weights` broadcast to `values` shape. + """ + weights_shape = weights.get_shape() + values_shape = values.get_shape() + if (weights_shape.is_fully_defined() and + values_shape.is_fully_defined() and + weights_shape.is_compatible_with(values_shape)): + return weights + return math_ops.mul( + weights, array_ops.ones_like(values), name='broadcast_weights') + + +def _safe_div(numerator, denominator, name): + """Divides two values, returning 0 if the denominator is <= 0. + + Args: + numerator: A real `Tensor`. + denominator: A real `Tensor`, with dtype matching `numerator`. + name: Name for the returned op. + + Returns: + 0 if `denominator` <= 0, else `numerator` / `denominator` + """ + return array_ops.where( + math_ops.greater(denominator, 0), + math_ops.truediv(numerator, denominator), + 0, + name=name) + + +def _safe_scalar_div(numerator, denominator, name): + """Divides two values, returning 0 if the denominator is 0. + + Args: + numerator: A scalar `float64` `Tensor`. + denominator: A scalar `float64` `Tensor`. + name: Name for the returned op. 
+ + Returns: + 0 if `denominator` == 0, else `numerator` / `denominator` + """ + numerator.get_shape().with_rank_at_most(1) + denominator.get_shape().with_rank_at_most(1) + return control_flow_ops.cond( + math_ops.equal( + array_ops.constant(0.0, dtype=dtypes.float64), denominator), + lambda: array_ops.constant(0.0, dtype=dtypes.float64), + lambda: math_ops.div(numerator, denominator), + name=name) + + +def mean(values, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes the (weighted) mean of the given values. + + The `mean` function creates two local variables, `total` and `count` + that are used to compute the average of `values`. This average is ultimately + returned as `mean` which is an idempotent operation that simply divides + `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean`. + `update_op` increments `total` with the reduced sum of the product of `values` + and `weights`, and it increments `count` with the reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `Tensor` of arbitrary dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that `mean` + should be added to. + updates_collections: An optional list of collections that `update_op` + should be added to. + name: An optional variable_scope name. + + Returns: + mean: A `Tensor` representing the current mean, the value of `total` divided + by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_value`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + with variable_scope.variable_scope(name, 'mean', (values, weights)): + values = math_ops.to_float(values) + + total = _create_local('total', shape=[]) + count = _create_local('count', shape=[]) + + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + num_values = math_ops.reduce_sum(_broadcast_weights(weights, values)) + else: + num_values = math_ops.to_float(array_ops.size(values)) + + total_compute_op = state_ops.assign_add(total, math_ops.reduce_sum(values)) + count_compute_op = state_ops.assign_add(count, num_values) + + mean_t = _safe_div(total, count, 'value') + with ops.control_dependencies([total_compute_op, count_compute_op]): + update_op = _safe_div(total, count, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_t) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_t, update_op + + +def accuracy(labels, predictions, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Calculates how often `predictions` matches `labels`. + + The `accuracy` function creates two local variables, `total` and + `count` that are used to compute the frequency with which `predictions` + matches `labels`. This frequency is ultimately returned as `accuracy`: an + idempotent operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `accuracy`. 
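A streaming sketch of the `mean` metric: `update_op` folds each batch into the `total` and `count` local variables, and the first returned tensor reads off the running mean. This assumes a build where `tf.local_variables_initializer()` is available:

import tensorflow as tf
from tensorflow.python.ops import metrics

values = tf.placeholder(tf.float32, shape=[None])
mean_value, update_op = metrics.mean(values)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())       # metric counters are local variables
  sess.run(update_op, feed_dict={values: [1.0, 2.0]})
  sess.run(update_op, feed_dict={values: [3.0]})
  print(sess.run(mean_value))                      # 2.0, the mean over both batches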
+ Internally, an `is_correct` operation computes a `Tensor` with elements 1.0 + where the corresponding elements of `predictions` and `labels` match and 0.0 + otherwise. Then `update_op` increments `total` with the reduced sum of the + product of `weights` and `is_correct`, and it increments `count` with the + reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `Tensor` whose shape matches + `predictions`. + predictions: The predicted values, a `Tensor` of any shape. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `accuracy` should + be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + accuracy: A `Tensor` representing the accuracy, the value of `total` divided + by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `accuracy`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights=weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + if labels.dtype != predictions.dtype: + predictions = math_ops.cast(predictions, labels.dtype) + is_correct = math_ops.to_float(math_ops.equal(predictions, labels)) + return mean(is_correct, weights, metrics_collections, + updates_collections, name or 'accuracy') + + +def _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights=None, includes=None): + """Computes true_positives, false_negatives, true_negatives, false_positives. + + This function creates up to four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives`. + `true_positive[i]` is defined as the total weight of values in `predictions` + above `thresholds[i]` whose corresponding entry in `labels` is `True`. + `false_negatives[i]` is defined as the total weight of values in `predictions` + at most `thresholds[i]` whose corresponding entry in `labels` is `True`. + `true_negatives[i]` is defined as the total weight of values in `predictions` + at most `thresholds[i]` whose corresponding entry in `labels` is `False`. + `false_positives[i]` is defined as the total weight of values in `predictions` + above `thresholds[i]` whose corresponding entry in `labels` is `False`. + + For estimation of these metrics over a stream of data, for each metric the + function respectively creates an `update_op` operation that updates the + variable and returns its value. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` whose shape matches `predictions`. `labels` will be cast + to `bool`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + thresholds: A python list or tuple of float thresholds in `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + includes: Tuple of keys to return, from 'tp', 'fn', 'tn', fp'. If `None`, + default to all four. + + Returns: + values: Dict of variables of shape `[len(thresholds)]`. 
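Putting `accuracy` together with its update op, under the same assumptions as the `mean` sketch above:

import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([1, 0, 1, 1])
predictions = tf.constant([1, 1, 1, 0])
acc, update_op = metrics.accuracy(labels, predictions)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)
  print(sess.run(acc))                             # 0.5: two of four predictions match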
Keys are from + `includes`. + update_ops: Dict of operations that increments the `values`. Keys are from + `includes`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + `includes` contains invalid keys. + """ + all_includes = ('tp', 'fn', 'tn', 'fp') + if includes is None: + includes = all_includes + else: + for include in includes: + if include not in all_includes: + raise ValueError('Invaild key: %s.' % include) + + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + num_thresholds = len(thresholds) + + # Reshape predictions and labels. + predictions_2d = array_ops.reshape(predictions, [-1, 1]) + labels_2d = array_ops.reshape( + math_ops.cast(labels, dtype=dtypes.bool), [1, -1]) + + # Use static shape if known. + num_predictions = predictions_2d.get_shape().as_list()[0] + + # Otherwise use dynamic shape. + if num_predictions is None: + num_predictions = array_ops.shape(predictions_2d)[0] + thresh_tiled = array_ops.tile( + array_ops.expand_dims(array_ops.constant(thresholds), [1]), + array_ops.pack([1, num_predictions])) + + # Tile the predictions after thresholding them across different thresholds. + pred_is_pos = math_ops.greater( + array_ops.tile(array_ops.transpose(predictions_2d), [num_thresholds, 1]), + thresh_tiled) + if ('fn' in includes) or ('tn' in includes): + pred_is_neg = math_ops.logical_not(pred_is_pos) + + # Tile labels by number of thresholds + label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1]) + if ('fp' in includes) or ('tn' in includes): + label_is_neg = math_ops.logical_not(label_is_pos) + + if weights is not None: + weights = math_ops.to_float(weights) + weights_tiled = array_ops.tile(array_ops.reshape(_broadcast_weights( + weights, predictions), [1, -1]), [num_thresholds, 1]) + thresh_tiled.get_shape().assert_is_compatible_with( + weights_tiled.get_shape()) + else: + weights_tiled = None + + values = {} + update_ops = {} + + if 'tp' in includes: + true_p = _create_local('true_positives', shape=[num_thresholds]) + is_true_positive = math_ops.to_float( + math_ops.logical_and(label_is_pos, pred_is_pos)) + if weights_tiled is not None: + is_true_positive *= weights_tiled + update_ops['tp'] = state_ops.assign_add( + true_p, math_ops.reduce_sum(is_true_positive, 1)) + values['tp'] = true_p + + if 'fn' in includes: + false_n = _create_local('false_negatives', shape=[num_thresholds]) + is_false_negative = math_ops.to_float( + math_ops.logical_and(label_is_pos, pred_is_neg)) + if weights_tiled is not None: + is_false_negative *= weights_tiled + update_ops['fn'] = state_ops.assign_add( + false_n, math_ops.reduce_sum(is_false_negative, 1)) + values['fn'] = false_n + + if 'tn' in includes: + true_n = _create_local('true_negatives', shape=[num_thresholds]) + is_true_negative = math_ops.to_float( + math_ops.logical_and(label_is_neg, pred_is_neg)) + if weights_tiled is not None: + is_true_negative *= weights_tiled + update_ops['tn'] = state_ops.assign_add( + true_n, math_ops.reduce_sum(is_true_negative, 1)) + values['tn'] = true_n + + if 'fp' in includes: + false_p = _create_local('false_positives', shape=[num_thresholds]) + is_false_positive = math_ops.to_float( + math_ops.logical_and(label_is_neg, pred_is_pos)) + if weights_tiled is not None: + is_false_positive *= weights_tiled + update_ops['fp'] = state_ops.assign_add( + false_p, 
math_ops.reduce_sum(is_false_positive, 1)) + values['fp'] = false_p + + return values, update_ops + + +def auc(labels, predictions, weights=None, num_thresholds=200, + metrics_collections=None, updates_collections=None, + curve='ROC', name=None): + """Computes the approximate AUC via a Riemann sum. + + The `auc` function creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. + + This value is ultimately returned as `auc`, an idempotent operation that + computes the area under a discretized curve of precision versus recall values + (computed using the aforementioned variables). The `num_thresholds` variable + controls the degree of discretization with larger numbers of thresholds more + closely approximating the true AUC. The quality of the approximation may vary + dramatically depending on `num_thresholds`. + + For best results, `predictions` should be distributed approximately uniformly + in the range [0, 1] and not peaked around 0 or 1. The quality of the AUC + approximation may be poor if this is not the case. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `auc`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + num_thresholds: The number of thresholds to use when discretizing the roc + curve. + metrics_collections: An optional list of collections that `auc` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + curve: Specifies the name of the curve to be computed, 'ROC' [default] or + 'PR' for the Precision-Recall-curve. + name: An optional variable_scope name. + + Returns: + auc: A scalar `Tensor` representing the current area-under-curve. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables + appropriately and whose value matches `auc`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'auc', (labels, predictions, weights)): + if curve != 'ROC' and curve != 'PR': + raise ValueError('curve must be either ROC or PR, %s unknown' % + (curve)) + kepsilon = 1e-7 # to account for floating point imprecisions + thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds-2)] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights) + + # Add epsilons to avoid dividing by 0. 
+ epsilon = 1.0e-6 + def compute_auc(tp, fn, tn, fp, name): + """Computes the roc-auc or pr-auc based on confusion counts.""" + rec = math_ops.div(tp + epsilon, tp + fn + epsilon) + if curve == 'ROC': + fp_rate = math_ops.div(fp, fp + tn + epsilon) + x = fp_rate + y = rec + else: # curve == 'PR'. + prec = math_ops.div(tp + epsilon, tp + fp + epsilon) + x = rec + y = prec + return math_ops.reduce_sum(math_ops.mul( + x[:num_thresholds - 1] - x[1:], + (y[:num_thresholds - 1] + y[1:]) / 2.), name=name) + + # sum up the areas of all the trapeziums + auc_value = compute_auc( + values['tp'], values['fn'], values['tn'], values['fp'], 'value') + update_op = compute_auc( + update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'], + 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, auc_value) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return auc_value, update_op + + +def mean_absolute_error(labels, predictions, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the mean absolute error between the labels and predictions. + + The `mean_absolute_error` function creates two local variables, + `total` and `count` that are used to compute the mean absolute error. This + average is weighted by `weights`, and it is ultimately returned as + `mean_absolute_error`: an idempotent operation that simply divides `total` by + `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `mean_absolute_error`. Internally, an `absolute_errors` operation computes the + absolute value of the differences between `predictions` and `labels`. Then + `update_op` increments `total` with the reduced sum of the product of + `weights` and `absolute_errors`, and it increments `count` with the reduced + sum of `weights` + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` of the same shape as `predictions`. + predictions: A `Tensor` of arbitrary shape. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that + `mean_absolute_error` should be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + mean_absolute_error: A `Tensor` representing the current mean, the value of + `total` divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_absolute_error`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + predictions, labels, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + absolute_errors = math_ops.abs(predictions - labels) + return mean(absolute_errors, weights, metrics_collections, + updates_collections, name or 'mean_absolute_error') + + +def mean_cosine_distance(labels, predictions, dim, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the cosine distance between the labels and predictions. 
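A rough end-to-end sketch of the streaming `auc` metric (scores and labels are made up; the result approximates the ROC AUC for these four examples):

import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([True, False, True, False])
scores = tf.constant([0.9, 0.6, 0.4, 0.1])
auc_value, auc_update = metrics.auc(labels, scores, num_thresholds=200)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(auc_update)                             # accumulate tp/fp/tn/fn counts
  print(sess.run(auc_value))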
+ + The `mean_cosine_distance` function creates two local variables, + `total` and `count` that are used to compute the average cosine distance + between `predictions` and `labels`. This average is weighted by `weights`, + and it is ultimately returned as `mean_distance`, which is an idempotent + operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `mean_distance`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` of arbitrary shape. + predictions: A `Tensor` of the same shape as `labels`. + dim: The dimension along which the cosine distance is computed. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`, + and whose dimension `dim` is 1. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + mean_distance: A `Tensor` representing the current mean, the value of + `total` divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + radial_diffs = math_ops.mul(predictions, labels) + radial_diffs = math_ops.reduce_sum(radial_diffs, + reduction_indices=[dim,], + keep_dims=True) + mean_distance, update_op = mean(radial_diffs, weights, + None, + None, + name or 'mean_cosine_distance') + mean_distance = math_ops.sub(1.0, mean_distance) + update_op = math_ops.sub(1.0, update_op) + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_distance) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_distance, update_op + + +def mean_iou(labels, + predictions, + num_classes, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Calculate per-step mean Intersection-Over-Union (mIOU). + + Mean Intersection-Over-Union is a common evaluation metric for + semantic image segmentation, which first computes the IOU for each + semantic class and then computes the average over classes. + IOU is defined as follows: + IOU = true_positive / (true_positive + false_positive + false_negative). + The predictions are accumulated in a confusion matrix, weighted by `weights`, + and mIOU is then calculated from it. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean_iou`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` of ground truth labels with shape [batch size] and of + type `int32` or `int64`. The tensor will be flattened, if its rank > 1. + predictions: A `Tensor` of prediction results for semantic labels, whose + shape is [batch size] and type `int32` or `int64`. The tensor will be + flattened, if its rank > 1. 
+ num_classes: The possible number of labels the prediction task can + have. This value must be provided, since a confusion matrix of + dimension = [num_classes, num_classes] will be allocated. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `mean_iou` + should be added to. + updates_collections: An optional list of collections `update_op` should be + added to. + name: An optional variable_scope name. + + Returns: + mean_iou: A `Tensor` representing the mean intersection-over-union. + update_op: An operation that increments the confusion matrix. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'mean_iou', (predictions, labels, weights)): + # Check if shape is compatible. + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + # Local variable to accumulate the predictions in the confusion matrix. + cm_dtype = dtypes.int64 if weights is not None else dtypes.float64 + total_cm = _create_local('total_confusion_matrix', + shape=[num_classes, num_classes], dtype=cm_dtype) + + # Cast the type to int64 required by confusion_matrix_ops. + predictions = math_ops.to_int64(predictions) + labels = math_ops.to_int64(labels) + num_classes = math_ops.to_int64(num_classes) + + # Flatten the input if its rank > 1. + predictions_rank = predictions.get_shape().ndims + if predictions_rank > 1: + predictions = array_ops.reshape(predictions, [-1]) + + labels_rank = labels.get_shape().ndims + if labels_rank > 1: + labels = array_ops.reshape(labels, [-1]) + + if weights is not None: + weights_rank = weights.get_shape().ndims + if weights_rank > 1: + weights = array_ops.reshape(weights, [-1]) + + # Accumulate the prediction to current confusion matrix. + current_cm = confusion_matrix.confusion_matrix( + labels, predictions, num_classes, weights=weights, dtype=cm_dtype) + update_op = state_ops.assign_add(total_cm, current_cm) + + def compute_mean_iou(name): + """Compute the mean intersection-over-union via the confusion matrix.""" + sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0)) + sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1)) + cm_diag = math_ops.to_float(array_ops.diag_part(total_cm)) + denominator = sum_over_row + sum_over_col - cm_diag + + # If the value of the denominator is 0, set it to 1 to avoid + # zero division. + denominator = array_ops.where( + math_ops.greater(denominator, 0), + denominator, + array_ops.ones_like(denominator)) + iou = math_ops.div(cm_diag, denominator) + return math_ops.reduce_mean(iou, name=name) + + mean_iou_v = compute_mean_iou('mean_iou') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_iou_v) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_iou_v, update_op + + +def mean_relative_error(labels, predictions, normalizer, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the mean relative error by normalizing with the given values. + + The `mean_relative_error` function creates two local variables, + `total` and `count` that are used to compute the mean relative absolute error. 
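A small `mean_iou` sketch for a three-class problem; `update_op` accumulates the confusion matrix and the first returned tensor computes the mean IOU from it:

import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([0, 1, 1, 2])
predictions = tf.constant([0, 1, 2, 2])
miou, update_op = metrics.mean_iou(labels, predictions, num_classes=3)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)
  print(sess.run(miou))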
+ This average is weighted by `weights`, and it is ultimately returned as
+ `mean_relative_error`: an idempotent operation that simply divides `total` by
+ `count`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `mean_relative_error`. Internally, a `relative_errors` operation divides the
+ absolute value of the differences between `predictions` and `labels` by the
+ `normalizer`. Then `update_op` increments `total` with the reduced sum of the
+ product of `weights` and `relative_errors`, and it increments `count` with the
+ reduced sum of `weights`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: A `Tensor` of the same shape as `predictions`.
+ predictions: A `Tensor` of arbitrary shape.
+ normalizer: A `Tensor` of the same shape as `predictions`.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that
+ `mean_relative_error` should be added to.
+ updates_collections: An optional list of collections that `update_op` should
+ be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ mean_relative_error: A `Tensor` representing the current mean, the value of
+ `total` divided by `count`.
+ update_op: An operation that increments the `total` and `count` variables
+ appropriately and whose value matches `mean_relative_error`.
+
+ Raises:
+ ValueError: If `predictions` and `labels` have mismatched shapes, or if
+ `weights` is not `None` and its shape doesn't match `predictions`, or if
+ either `metrics_collections` or `updates_collections` are not a list or
+ tuple.
+ """
+ labels, predictions, weights = _remove_squeezable_dimensions(
+ labels, predictions, weights)
+ predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+
+ predictions, normalizer = confusion_matrix.remove_squeezable_dimensions(
+ predictions, normalizer)
+ predictions.get_shape().assert_is_compatible_with(normalizer.get_shape())
+ relative_errors = array_ops.where(
+ math_ops.equal(normalizer, 0.0),
+ array_ops.zeros_like(labels),
+ math_ops.div(math_ops.abs(labels - predictions), normalizer))
+ return mean(relative_errors, weights, metrics_collections,
+ updates_collections, name or 'mean_relative_error')
+
+
+def mean_squared_error(labels, predictions, weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes the mean squared error between the labels and predictions.
+
+ The `mean_squared_error` function creates two local variables,
+ `total` and `count` that are used to compute the mean squared error.
+ This average is weighted by `weights`, and it is ultimately returned as
+ `mean_squared_error`: an idempotent operation that simply divides `total` by
+ `count`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `mean_squared_error`. Internally, a `squared_error` operation computes the
+ element-wise square of the difference between `predictions` and `labels`. Then
+ `update_op` increments `total` with the reduced sum of the product of
+ `weights` and `squared_error`, and it increments `count` with the reduced sum
+ of `weights`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: A `Tensor` of the same shape as `predictions`.
+ predictions: A `Tensor` of arbitrary shape.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that + `mean_squared_error` should be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + mean_squared_error: A `Tensor` representing the current mean, the value of + `total` divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_squared_error`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + squared_error = math_ops.square(labels - predictions) + return mean(squared_error, weights, metrics_collections, + updates_collections, name or 'mean_squared_error') + + +def mean_tensor(values, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes the element-wise (weighted) mean of the given tensors. + + In contrast to the `mean` function which returns a scalar with the + mean, this function returns an average tensor with the same shape as the + input tensors. + + The `mean_tensor` function creates two local variables, + `total_tensor` and `count_tensor` that are used to compute the average of + `values`. This average is ultimately returned as `mean` which is an idempotent + operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean`. + `update_op` increments `total` with the reduced sum of the product of `values` + and `weights`, and it increments `count` with the reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `Tensor` of arbitrary dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that `mean` + should be added to. + updates_collections: An optional list of collections that `update_op` + should be added to. + name: An optional variable_scope name. + + Returns: + mean: A float `Tensor` representing the current mean, the value of `total` + divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_value`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. 
+ """ + with variable_scope.variable_scope(name, 'mean', (values, weights)): + total = _create_local('total_tensor', shape=values.get_shape()) + count = _create_local('count_tensor', shape=values.get_shape()) + + num_values = array_ops.ones_like(values) + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + num_values = math_ops.mul(num_values, weights) + + total_compute_op = state_ops.assign_add(total, values) + count_compute_op = state_ops.assign_add(count, num_values) + + def compute_mean(total, count, name): + non_zero_count = math_ops.maximum(count, + array_ops.ones_like(count), + name=name) + return math_ops.truediv(total, non_zero_count, name=name) + + mean_t = compute_mean(total, count, 'value') + with ops.control_dependencies([total_compute_op, count_compute_op]): + update_op = compute_mean(total, count, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_t) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_t, update_op + + +def percentage_below(values, threshold, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the percentage of values less than the given threshold. + + The `percentage_below` function creates two local variables, + `total` and `count` that are used to compute the percentage of `values` that + fall below `threshold`. This rate is weighted by `weights`, and it is + ultimately returned as `percentage` which is an idempotent operation that + simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `percentage`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A numeric `Tensor` of arbitrary size. + threshold: A scalar threshold. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + percentage: A `Tensor` representing the current mean, the value of `total` + divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + is_below_threshold = math_ops.to_float(math_ops.less(values, threshold)) + return mean(is_below_threshold, + weights, + metrics_collections, + updates_collections, + name or 'percentage_below_threshold') + + +def _count_condition(values, weights=None, metrics_collections=None, + updates_collections=None): + """Sums the weights of cases where the given values are True. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `bool` `Tensor` of arbitrary size. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. 
+ + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + check_ops.assert_type(values, dtypes.bool) + count = _create_local('count', shape=[]) + + values = math_ops.to_float(values) + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + + value_tensor = array_ops.identity(count) + update_op = state_ops.assign_add(count, math_ops.reduce_sum(values)) + + if metrics_collections: + ops.add_to_collections(metrics_collections, value_tensor) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return value_tensor, update_op + + +def true_positives(labels, predictions, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Sum the weights of true_positives. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary + dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'true_positives', (predictions, labels, weights)): + + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + is_true_positive = math_ops.logical_and(math_ops.equal(labels, 1), + math_ops.equal(predictions, 1)) + return _count_condition(is_true_positive, weights, metrics_collections, + updates_collections) + + +def false_positives(labels, predictions, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Sum the weights of false positives. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary + dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. 
+ + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'false_positives', (predictions, labels, weights)): + + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + is_false_positive = math_ops.logical_and(math_ops.equal(labels, 0), + math_ops.equal(predictions, 1)) + return _count_condition(is_false_positive, weights, metrics_collections, + updates_collections) + + +def precision(labels, predictions, weights=None, + metrics_collections=None, updates_collections=None, + name=None): + """Computes the precision of the predictions with respect to the labels. + + The `precision` function creates two local variables, + `true_positives` and `false_positives`, that are used to compute the + precision. This value is ultimately returned as `precision`, an idempotent + operation that simply divides `true_positives` by the sum of `true_positives` + and `false_positives`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision`. `update_op` weights each prediction by the corresponding value in + `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `precision` should + be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + precision: Scalar float `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. + update_op: `Operation` that increments `true_positives` and + `false_positives` variables appropriately and whose value matches + `precision`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. 
+ """ + with variable_scope.variable_scope( + name, 'precision', (predictions, labels, weights)): + + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + true_p, true_positives_update_op = true_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + false_p, false_positives_update_op = false_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + + def compute_precision(name): + return array_ops.where( + math_ops.greater(true_p + false_p, 0), + math_ops.div(true_p, true_p + false_p), + 0, + name) + + p = compute_precision('value') + with ops.control_dependencies([true_positives_update_op, + false_positives_update_op]): + update_op = compute_precision('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, p) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return p, update_op + + +def precision_at_thresholds(labels, predictions, thresholds, + weights=None, + metrics_collections=None, + updates_collections=None, name=None): + """Computes precision values for different `thresholds` on `predictions`. + + The `precision_at_thresholds` function creates four local variables, + `true_positives`, `true_negatives`, `false_positives` and `false_negatives` + for various values of thresholds. `precision[i]` is defined as the total + weight of values in `predictions` above `thresholds[i]` whose corresponding + entry in `labels` is `True`, divided by the total weight of values in + `predictions` above `thresholds[i]` (`true_positives[i] / (true_positives[i] + + false_positives[i])`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + thresholds: A python list or tuple of float thresholds in `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `auc` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + precision: A float `Tensor` of shape `[len(thresholds)]`. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables that + are used in the computation of `precision`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope(name, 'precision_at_thresholds', + (predictions, labels, weights)): + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights, includes=('tp', 'fp')) + tp = values['tp'] + fp = values['fp'] + + # Avoid division by zero. 
+ epsilon = 1e-7
+ def compute_precision(name):
+ return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
+
+ prec = compute_precision('value')
+ with ops.control_dependencies(update_ops.values()):
+ update_op = compute_precision('update_op')
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, prec)
+
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update_op)
+
+ return prec, update_op
+
+
+def false_negatives(labels, predictions, weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes the total number of false negatives.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: The ground truth values, a `bool` `Tensor` whose dimensions must
+ match `predictions`.
+ predictions: The predicted values, a `bool` `Tensor` of arbitrary
+ dimensions.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that the metric
+ value variable should be added to.
+ updates_collections: An optional list of collections that the metric update
+ ops should be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ value_tensor: A `Tensor` representing the current value of the metric.
+ update_op: An operation that accumulates the error from a batch of data.
+
+ Raises:
+ ValueError: If `weights` is not `None` and its shape doesn't match `predictions`,
+ or if either `metrics_collections` or `updates_collections` are not a list
+ or tuple.
+ """
+ with variable_scope.variable_scope(
+ name, 'false_negatives', (predictions, labels, weights)):
+
+ predictions = ops.convert_to_tensor(predictions)
+ labels = ops.convert_to_tensor(labels)
+ predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+ is_false_negative = math_ops.logical_and(math_ops.equal(labels, 1),
+ math_ops.equal(predictions, 0))
+ return _count_condition(is_false_negative, weights, metrics_collections,
+ updates_collections)
+
+
+def recall(labels, predictions, weights=None,
+ metrics_collections=None, updates_collections=None,
+ name=None):
+ """Computes the recall of the predictions with respect to the labels.
+
+ The `recall` function creates two local variables, `true_positives`
+ and `false_negatives`, that are used to compute the recall. This value is
+ ultimately returned as `recall`, an idempotent operation that simply divides
+ `true_positives` by the sum of `true_positives` and `false_negatives`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` that updates these variables and returns the `recall`. `update_op`
+ weights each prediction by the corresponding value in `weights`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: The ground truth values, a `bool` `Tensor` whose dimensions must
+ match `predictions`.
+ predictions: The predicted values, a `bool` `Tensor` of arbitrary shape.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that `recall` should
+ be added to.
+ updates_collections: An optional list of collections that `update_op` should
+ be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ recall: Scalar float `Tensor` with the value of `true_positives` divided
+ by the sum of `true_positives` and `false_negatives`.
+ update_op: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately and whose value matches + `recall`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'recall', (predictions, labels, weights)): + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + true_p, true_positives_update_op = true_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + false_n, false_negatives_update_op = false_negatives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + + def compute_recall(true_p, false_n, name): + return array_ops.where( + math_ops.greater(true_p + false_n, 0), + math_ops.div(true_p, true_p + false_n), + 0, + name) + + rec = compute_recall(true_p, false_n, 'value') + with ops.control_dependencies([true_positives_update_op, + false_negatives_update_op]): + update_op = compute_recall(true_p, false_n, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, rec) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return rec, update_op + + +def _at_k_name(name, k=None, class_id=None): + if k is not None: + name = '%s_at_%d' % (name, k) + else: + name = '%s_at_k' % (name) + if class_id is not None: + name = '%s_class%d' % (name, class_id) + return name + + +def _select_class_id(ids, selected_id): + """Filter all but `selected_id` out of `ids`. + + Args: + ids: `int64` `Tensor` or `SparseTensor` of IDs. + selected_id: Int id to select. + + Returns: + `SparseTensor` of same dimensions as `ids`. This contains only the entries + equal to `selected_id`. + """ + if isinstance( + ids, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + return sparse_ops.sparse_retain( + ids, math_ops.equal(ids.values, selected_id)) + + # TODO(ptucker): Make this more efficient, maybe add a sparse version of + # tf.equal and tf.reduce_any? + + # Shape of filled IDs is the same as `ids` with the last dim collapsed to 1. + ids_shape = array_ops.shape(ids, out_type=dtypes.int64) + ids_last_dim = array_ops.size(ids_shape) - 1 + filled_selected_id_shape = math_ops.reduced_shape( + ids_shape, array_ops.reshape(ids_last_dim, [1])) + + # Intersect `ids` with the selected ID. + filled_selected_id = array_ops.fill( + filled_selected_id_shape, math_ops.to_int64(selected_id)) + result = sets.set_intersection(filled_selected_id, ids) + return sparse_tensor.SparseTensor( + indices=result.indices, values=result.values, shape=ids_shape) + + +def _maybe_select_class_id(labels, predictions_idx, selected_id=None): + """If class ID is specified, filter all other classes. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: `int64` `Tensor` of class IDs, with shape [D1, ... DN, k] + where N >= 1. Commonly, N=1 and `predictions_idx` has shape + [batch size, k]. + selected_id: Int id to select. 
+ + Returns: + Tuple of `labels` and `predictions_idx`, possibly with classes removed. + """ + if selected_id is None: + return labels, predictions_idx + return (_select_class_id(labels, selected_id), + _select_class_id(predictions_idx, selected_id)) + + +def _sparse_true_positive_at_k(labels, + predictions_idx, + class_id=None, + weights=None, + name=None): + """Calculates true positives for recall@k and precision@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels_sparse`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + name: Name of operation. + + Returns: + A [D1, ... DN] `Tensor` of true positive counts. + """ + with ops.name_scope(name, 'true_positives', (predictions_idx, labels)): + labels, predictions_idx = _maybe_select_class_id( + labels, predictions_idx, class_id) + tp = sets.set_size(sets.set_intersection(predictions_idx, labels)) + tp = math_ops.to_double(tp) + if weights is not None: + weights = math_ops.to_double(weights) + tp = math_ops.mul(tp, weights) + return tp + + +def _streaming_sparse_true_positive_at_k(labels, + predictions_idx, + k=None, + class_id=None, + weights=None, + name=None): + """Calculates weighted per step true positives for recall@k and precision@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + k: Integer, k for @k metric. This is only used for default op name. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + name: Name of new variable, and namespace for other dependent ops. + + Returns: + A tuple of `Variable` and update `Operation`. + + Raises: + ValueError: If `weights` is not `None` and has an incomptable shape. 
+ """ + default_name = _at_k_name('true_positive', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope: + tp = _sparse_true_positive_at_k( + predictions_idx=predictions_idx, labels=labels, class_id=class_id, + weights=weights) + batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp)) + + var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope) + return var, state_ops.assign_add(var, batch_total_tp, name='update') + + +def _sparse_false_negative_at_k(labels, + predictions_idx, + class_id=None, + weights=None): + """Calculates false negatives for recall@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels_sparse`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + + Returns: + A [D1, ... DN] `Tensor` of false negative counts. + """ + with ops.name_scope(None, 'false_negatives', (predictions_idx, labels)): + labels, predictions_idx = _maybe_select_class_id(labels, + predictions_idx, + class_id) + fn = sets.set_size(sets.set_difference(predictions_idx, + labels, + aminusb=False)) + fn = math_ops.to_double(fn) + if weights is not None: + weights = math_ops.to_double(weights) + fn = math_ops.mul(fn, weights) + return fn + + +def _streaming_sparse_false_negative_at_k(labels, + predictions_idx, + k, + class_id=None, + weights=None, + name=None): + """Calculates weighted per step false negatives for recall@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + k: Integer, k for @k metric. This is only used for default op name. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + name: Name of new variable, and namespace for other dependent ops. + + Returns: + A tuple of `Variable` and update `Operation`. + + Raises: + ValueError: If `weights` is not `None` and has an incomptable shape. 
+ """ + default_name = _at_k_name('false_negative', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope: + fn = _sparse_false_negative_at_k( + predictions_idx=predictions_idx, labels=labels, class_id=class_id, + weights=weights) + batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn)) + + var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope) + return var, state_ops.assign_add(var, batch_total_fn, name='update') + + +def recall_at_k(labels, + predictions, + k, + class_id=None, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes recall@k of the predictions with respect to sparse labels. + + If `class_id` is specified, we calculate recall by considering only the + entries in the batch for which `class_id` is in the label, and computing + the fraction of them for which `class_id` is in the top-k `predictions`. + If `class_id` is not specified, we'll calculate recall as how often on + average a class among the labels of a batch entry is in the top-k + `predictions`. + + `sparse_recall_at_k` creates two local variables, + `true_positive_at_` and `false_negative_at_`, that are used to compute + the recall_at_k frequency. This frequency is ultimately returned as + `recall_at_`: an idempotent operation that simply divides + `true_positive_at_` by total (`true_positive_at_` + + `false_negative_at_`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `recall_at_`. Internally, a `top_k` operation computes a `Tensor` + indicating the top `k` `predictions`. Set operations applied to `top_k` and + `labels` calculate the true positives and false negatives weighted by + `weights`. Then `update_op` increments `true_positive_at_` and + `false_negative_at_` using these values. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. + Values should be in range [0, num_classes), where num_classes is the last + dimension of `predictions`. Values outside this range always count + towards `false_negative_at_`. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. + k: Integer, k for @k metric. + class_id: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If class_id is outside this range, the method returns NAN. + weights: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. + metrics_collections: An optional list of collections that values should + be added to. + updates_collections: An optional list of collections that updates should + be added to. + name: Name of new update operation, and namespace for other dependent ops. + + Returns: + recall: Scalar `float64` `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. 
+ update_op: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately, and whose value matches + `recall`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. + """ + default_name = _at_k_name('recall', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions, labels)) as scope: + _, top_k_idx = nn.top_k(predictions, k) + top_k_idx = math_ops.to_int64(top_k_idx) + tp, tp_update = _streaming_sparse_true_positive_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + fn, fn_update = _streaming_sparse_false_negative_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + + metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope) + update = math_ops.div( + tp_update, math_ops.add(tp_update, fn_update), name='update') + if metrics_collections: + ops.add_to_collections(metrics_collections, metric) + if updates_collections: + ops.add_to_collections(updates_collections, update) + return metric, update + + +def recall_at_thresholds(labels, predictions, thresholds, + weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes various recall values for different `thresholds` on `predictions`. + + The `recall_at_thresholds` function creates four local variables, + `true_positives`, `true_negatives`, `false_positives` and `false_negatives` + for various values of thresholds. `recall[i]` is defined as the total weight + of values in `predictions` above `thresholds[i]` whose corresponding entry in + `labels` is `True`, divided by the total weight of `True` values in `labels` + (`true_positives[i] / (true_positives[i] + false_negatives[i])`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `recall`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + thresholds: A python list or tuple of float thresholds in `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `recall` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + recall: A float `Tensor` of shape `[len(thresholds)]`. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables that + are used in the computation of `recall`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope(name, 'recall_at_thresholds', + (predictions, labels, weights)): + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights, includes=('tp', 'fn')) + tp = values['tp'] + fn = values['fn'] + + # Avoid division by zero. 
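+ # (Illustrative note: tp and fn are float vectors with one entry per
+ # threshold, so the division below produces recall at every threshold in a
+ # single op; when tp + fn == 0 the epsilon gives 0 rather than NaN.)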
+ epsilon = 1e-7
+ def compute_recall(name):
+ return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
+
+ rec = compute_recall('value')
+ with ops.control_dependencies(update_ops.values()):
+ update_op = compute_recall('update_op')
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, rec)
+
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update_op)
+
+ return rec, update_op
+
+
+def root_mean_squared_error(labels, predictions, weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes the root mean squared error between the labels and predictions.
+
+ The `root_mean_squared_error` function creates two local variables,
+ `total` and `count` that are used to compute the root mean squared error.
+ This average is weighted by `weights`, and it is ultimately returned as
+ `root_mean_squared_error`: an idempotent operation that takes the square root
+ of the division of `total` by `count`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `root_mean_squared_error`. Internally, a `squared_error` operation computes
+ the element-wise square of the difference between `predictions` and `labels`.
+ Then `update_op` increments `total` with the reduced sum of the product of
+ `weights` and `squared_error`, and it increments `count` with the reduced sum
+ of `weights`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: A `Tensor` of the same shape as `predictions`.
+ predictions: A `Tensor` of arbitrary shape.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that
+ `root_mean_squared_error` should be added to.
+ updates_collections: An optional list of collections that `update_op` should
+ be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ root_mean_squared_error: A `Tensor` representing the current mean, the value
+ of `total` divided by `count`.
+ update_op: An operation that increments the `total` and `count` variables
+ appropriately and whose value matches `root_mean_squared_error`.
+
+ Raises:
+ ValueError: If `predictions` and `labels` have mismatched shapes, or if
+ `weights` is not `None` and its shape doesn't match `predictions`, or if
+ either `metrics_collections` or `updates_collections` are not a list or
+ tuple.
+ """
+ labels, predictions, weights = _remove_squeezable_dimensions(
+ labels, predictions, weights)
+ predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+ value_tensor, update_op = mean_squared_error(
+ labels, predictions, weights, None, None,
+ name or 'root_mean_squared_error')
+
+ rmse = math_ops.sqrt(value_tensor)
+ with ops.control_dependencies([update_op]):
+ update_op = math_ops.sqrt(update_op)
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, rmse)
+
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update_op)
+
+ return rmse, update_op
+
+
+def sensitivity_at_specificity(
+ labels, predictions, specificity, weights=None, num_thresholds=200,
+ metrics_collections=None, updates_collections=None, name=None):
+ """Computes the sensitivity at a given specificity.
+ + The `sensitivity_at_specificity` function creates four local + variables, `true_positives`, `true_negatives`, `false_positives` and + `false_negatives` that are used to compute the sensitivity at the given + specificity value. The threshold for the given specificity value is computed + and used to evaluate the corresponding sensitivity. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `sensitivity`. `update_op` increments the `true_positives`, `true_negatives`, + `false_positives` and `false_negatives` counts with the weight of each case + found in the `predictions` and `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + For additional information about specificity and sensitivity, see the + following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + specificity: A scalar value in range `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + num_thresholds: The number of thresholds to use for matching the given + specificity. + metrics_collections: An optional list of collections that `sensitivity` + should be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + sensitivity: A scalar `Tensor` representing the sensitivity at the given + `specificity` value. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables + appropriately and whose value matches `sensitivity`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, if + `weights` is not `None` and its shape doesn't match `predictions`, or if + `specificity` is not between 0 and 1, or if either `metrics_collections` + or `updates_collections` are not a list or tuple. 
+ """ + if specificity < 0 or specificity > 1: + raise ValueError('`specificity` must be in the range [0, 1].') + + with variable_scope.variable_scope(name, 'sensitivity_at_specificity', + (predictions, labels, weights)): + kepsilon = 1e-7 # to account for floating point imprecisions + thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds-2)] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights) + tp = values['tp'] + fn = values['fn'] + tn = values['tn'] + fp = values['fp'] + + def compute_sensitivity_at_specificity(name): + specificities = math_ops.div(tn, tn + fp + kepsilon) + tf_index = math_ops.argmin(math_ops.abs(specificities - specificity), 0) + tf_index = math_ops.cast(tf_index, dtypes.int32) + + # Now, we have the implicit threshold, so compute the sensitivity: + return math_ops.div(tp[tf_index], + tp[tf_index] + fn[tf_index] + kepsilon, + name) + + sensitivity = compute_sensitivity_at_specificity('value') + with ops.control_dependencies(update_ops.values()): + update_op = compute_sensitivity_at_specificity('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, sensitivity) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return sensitivity, update_op + + +def _expand_and_tile(tensor, multiple, dim=0, name=None): + """Slice `tensor` shape in 2, then tile along the sliced dimension. + + A new dimension is inserted in shape of `tensor` before `dim`, then values are + tiled `multiple` times along the new dimension. + + Args: + tensor: Input `Tensor` or `SparseTensor`. + multiple: Integer, number of times to tile. + dim: Integer, dimension along which to tile. + name: Name of operation. + + Returns: + `Tensor` result of expanding and tiling `tensor`. + + Raises: + ValueError: if `multiple` is less than 1, or `dim` is not in + `[-rank(tensor), rank(tensor)]`. + """ + if multiple < 1: + raise ValueError('Invalid multiple %s, must be > 0.' % multiple) + with ops.name_scope( + name, 'expand_and_tile', (tensor, multiple, dim)) as scope: + # Sparse. + if isinstance(tensor, sparse_tensor.SparseTensorValue): + tensor = sparse_tensor.SparseTensor.from_value(tensor) + if isinstance(tensor, sparse_tensor.SparseTensor): + if dim < 0: + expand_dims = array_ops.reshape( + array_ops.size(tensor.shape) + dim, [1]) + else: + expand_dims = [dim] + expanded_shape = array_ops.concat( + 0, (array_ops.slice(tensor.shape, [0], expand_dims), [1], + array_ops.slice(tensor.shape, expand_dims, [-1])), + name='expanded_shape') + expanded = sparse_ops.sparse_reshape( + tensor, shape=expanded_shape, name='expand') + if multiple == 1: + return expanded + return sparse_ops.sparse_concat( + dim - 1 if dim < 0 else dim, [expanded] * multiple, name=scope) + + # Dense. + expanded = array_ops.expand_dims( + tensor, dim if (dim >= 0) else (dim - 1), name='expand') + if multiple == 1: + return expanded + ones = array_ops.ones_like(array_ops.shape(tensor)) + tile_multiples = array_ops.concat( + 0, (ones[:dim], (multiple,), ones[dim:]), name='multiples') + return array_ops.tile(expanded, tile_multiples, name=scope) + + +def _num_relevant(labels, k): + """Computes number of relevant values for each row in labels. + + For labels with shape [D1, ... DN, num_labels], this is the minimum of + `num_labels` and `k`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... 
DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. + k: Integer, k for @k metric. + + Returns: + Integer `Tensor` of shape [D1, ... DN], where each value is the number of + relevant values for that row. + + Raises: + ValueError: if inputs have invalid dtypes or values. + """ + if k < 1: + raise ValueError('Invalid k=%s.' % k) + with ops.name_scope(None, 'num_relevant', (labels,)) as scope: + # For SparseTensor, calculate separate count for each row. + if isinstance( + labels, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + labels_sizes = sets.set_size(labels) + return math_ops.minimum(labels_sizes, k, name=scope) + + # For dense Tensor, calculate scalar count based on last dimension, and + # tile across labels shape. + labels_shape = array_ops.shape(labels) + labels_size = labels_shape[-1] + num_relevant_scalar = math_ops.minimum(labels_size, k) + return array_ops.fill(labels_shape[0:-1], num_relevant_scalar, name=scope) + + +def _sparse_average_precision_at_k(labels, predictions, k): + """Computes average precision@k of predictions with respect to sparse labels. + + From en.wikipedia.org/wiki/Information_retrieval#Average_precision, formula + for each row is: + + AveP = sum_{i=1...k} P_{i} * rel_{i} / num_relevant_items + + A "row" is the elements in dimension [D1, ... DN] of `predictions`, `labels`, + and the result `Tensors`. In the common case, this is [batch_size]. Each row + of the results contains the average precision for that row. + + Internally, a `top_k` operation computes a `Tensor` indicating the top `k` + `predictions`. Set operations applied to `top_k` and `labels` calculate the + true positives, which are used to calculate the precision ("P_{i}" term, + above). + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and `predictions` has shape + [batch size, num_classes]. The final dimension contains the logit values + for each class. [D1, ... DN] must match `labels`. + k: Integer, k for @k metric. This will calculate an average precision for + range `[1,k]`, as documented above. + + Returns: + `float64` `Tensor` of shape [D1, ... DN], where each value is the average + precision for that row. + + Raises: + ValueError: if k is invalid. + """ + if k < 1: + raise ValueError('Invalid k=%s.' % k) + with ops.name_scope( + None, 'average_precision', (predictions, labels, k)) as scope: + # Calculate top k indices to produce [D1, ... DN, k] tensor. + _, predictions_idx = nn.top_k(predictions, k) + predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx') + + # Expand dims to produce [D1, ... DN, k, 1] tensor. This gives us a separate + # prediction for each k, so we can calculate separate true positive values + # for each k. + predictions_idx_per_k = array_ops.expand_dims( + predictions_idx, -1, name='predictions_idx_per_k') + + # Replicate labels k times to produce [D1, ... DN, k, num_labels] tensor. 
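+ # For example (shapes only): if `labels` has shape [batch_size, num_labels]
+ # and k=3, `labels_per_k` below has shape [batch_size, 3, num_labels],
+ # aligning with `predictions_idx_per_k` of shape [batch_size, 3, 1] for the
+ # per-k true positive counts computed next.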
+ labels_per_k = _expand_and_tile( + labels, multiple=k, dim=-1, name='labels_per_k') + + # The following tensors are all of shape [D1, ... DN, k], containing values + # per row, per k value. + # `relevant_per_k` (int32) - Relevance indicator, 1 if the prediction at + # that k value is correct, 0 otherwise. This is the "rel_{i}" term from + # the formula above. + # `tp_per_k` (int32) - True positive counts. + # `retrieved_per_k` (int32) - Number of predicted values at each k. This is + # the precision denominator. + # `precision_per_k` (float64) - Precision at each k. This is the "P_{i}" + # term from the formula above. + # `relevant_precision_per_k` (float64) - Relevant precisions; i.e., + # precisions at all k for which relevance indicator is true. + relevant_per_k = _sparse_true_positive_at_k( + labels_per_k, predictions_idx_per_k, name='relevant_per_k') + tp_per_k = math_ops.cumsum(relevant_per_k, axis=-1, name='tp_per_k') + retrieved_per_k = math_ops.cumsum( + array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k') + precision_per_k = math_ops.div( + math_ops.to_double(tp_per_k), math_ops.to_double(retrieved_per_k), + name='precision_per_k') + relevant_precision_per_k = math_ops.mul( + precision_per_k, math_ops.to_double(relevant_per_k), + name='relevant_precision_per_k') + + # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor. + precision_sum = math_ops.reduce_sum( + relevant_precision_per_k, reduction_indices=(-1,), name='precision_sum') + + # Divide by number of relevant items to get average precision. These are + # the "num_relevant_items" and "AveP" terms from the formula above. + num_relevant_items = math_ops.to_double(_num_relevant(labels, k)) + return math_ops.div(precision_sum, num_relevant_items, name=scope) + + +def sparse_average_precision_at_k(labels, + predictions, + k, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes average precision@k of predictions with respect to sparse labels. + + `sparse_average_precision_at_k` creates two local variables, + `average_precision_at_/total` and `average_precision_at_/max`, that + are used to compute the frequency. This frequency is ultimately returned as + `average_precision_at_`: an idempotent operation that simply divides + `average_precision_at_/total` by `average_precision_at_/max`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision_at_`. Internally, a `top_k` operation computes a `Tensor` + indicating the top `k` `predictions`. Set operations applied to `top_k` and + `labels` calculate the true positives and false positives weighted by + `weights`. Then `update_op` increments `true_positive_at_` and + `false_positive_at_` using these values. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and `predictions` has shape + [batch size, num_classes]. 
The final dimension contains the logit values + for each class. [D1, ... DN] must match `labels`. + k: Integer, k for @k metric. This will calculate an average precision for + range `[1,k]`, as documented above. + weights: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. + metrics_collections: An optional list of collections that values should + be added to. + updates_collections: An optional list of collections that updates should + be added to. + name: Name of new update operation, and namespace for other dependent ops. + + Returns: + mean_average_precision: Scalar `float64` `Tensor` with the mean average + precision values. + update: `Operation` that increments variables appropriately, and whose + value matches `metric`. + """ + default_name = _at_k_name('average_precision', k) + with ops.name_scope(name, default_name, (predictions, labels)) as scope: + # Calculate per-example average precision, and apply weights. + average_precision = _sparse_average_precision_at_k( + predictions=predictions, labels=labels, k=k) + if weights is not None: + weights = math_ops.to_double(weights) + average_precision = math_ops.mul(average_precision, weights) + + # Create accumulation variables and update ops for max average precision and + # total average precision. + with ops.name_scope(None, 'max', (average_precision,)) as max_scope: + # `max` is the max possible precision. Since max for any row is 1.0: + # - For the unweighted case, this is just the number of rows. + # - For the weighted case, it's the sum of the weights broadcast across + # `average_precision` rows. + max_var = _local_variable( + array_ops.zeros([], dtype=dtypes.float64), name=max_scope) + if weights is None: + batch_max = math_ops.to_double( + array_ops.size(average_precision, name='batch_max')) + else: + # TODO(ptucker): More efficient way to broadcast? + broadcast_weights = math_ops.mul( + weights, array_ops.ones_like(average_precision), + name='broadcast_weights') + batch_max = math_ops.reduce_sum(broadcast_weights, name='batch_max') + max_update = state_ops.assign_add(max_var, batch_max, name='update') + with ops.name_scope(None, 'total', (average_precision,)) as total_scope: + total_var = _local_variable( + array_ops.zeros([], dtype=dtypes.float64), name=total_scope) + batch_total = math_ops.reduce_sum(average_precision, name='batch_total') + total_update = state_ops.assign_add(total_var, batch_total, name='update') + + # Divide total by max to get mean, for both vars and the update ops. + mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean') + update = _safe_scalar_div(total_update, max_update, name=scope) + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_average_precision) + if updates_collections: + ops.add_to_collections(updates_collections, update) + + return mean_average_precision, update + + +def _sparse_false_positive_at_k(labels, + predictions_idx, + class_id=None, + weights=None): + """Calculates false positives for precision@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels_sparse`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. 
Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + + Returns: + A [D1, ... DN] `Tensor` of false positive counts. + """ + with ops.name_scope(None, 'false_positives', (predictions_idx, labels)): + labels, predictions_idx = _maybe_select_class_id(labels, + predictions_idx, + class_id) + fp = sets.set_size(sets.set_difference( + predictions_idx, labels, aminusb=True)) + fp = math_ops.to_double(fp) + if weights is not None: + weights = math_ops.to_double(weights) + fp = math_ops.mul(fp, weights) + return fp + + +def _streaming_sparse_false_positive_at_k(labels, + predictions_idx, + k=None, + class_id=None, + weights=None, + name=None): + """Calculates weighted per step false positives for precision@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + k: Integer, k for @k metric. This is only used for default op name. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + name: Name of new variable, and namespace for other dependent ops. + + Returns: + A tuple of `Variable` and update `Operation`. + + Raises: + ValueError: If `weights` is not `None` and has an incomptable shape. + """ + default_name = _at_k_name('false_positive', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope: + fp = _sparse_false_positive_at_k( + predictions_idx=predictions_idx, labels=labels, class_id=class_id, + weights=weights) + batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp)) + + var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope) + return var, state_ops.assign_add(var, batch_total_fp, name='update') + + +def _sparse_precision_at_k(labels, + top_k_idx, + k=None, + class_id=None, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes precision@k of the top-k indices with respect to sparse labels. + + This method contains the code shared by streaming_sparse_precision_at_k and + streaming_sparse_precision_at_top_k. Refer to those methods for more details. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. 
Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. + top_k_idx: Integer `Tensor` with shape [D1, ... DN, k] where + N >= 1. Commonly, N=1 and top_k_idx has shape [batch size, k]. + The final dimension contains the indices of top-k labels. [D1, ... DN] + must match `labels`. + k: Integer, k for @k metric or `None`. Only used for default op name. + class_id: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. + weights: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. + metrics_collections: An optional list of collections that values should + be added to. + updates_collections: An optional list of collections that updates should + be added to. + name: Name of the metric and of the enclosing scope. + + Returns: + precision: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. + update_op: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. + """ + top_k_idx = math_ops.to_int64(top_k_idx) + tp, tp_update = _streaming_sparse_true_positive_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + fp, fp_update = _streaming_sparse_false_positive_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + + metric = math_ops.div(tp, math_ops.add(tp, fp), name=name) + update = math_ops.div( + tp_update, math_ops.add(tp_update, fp_update), name='update') + if metrics_collections: + ops.add_to_collections(metrics_collections, metric) + if updates_collections: + ops.add_to_collections(updates_collections, update) + return metric, update + + +def sparse_precision_at_k(labels, + predictions, + k, + class_id=None, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes precision@k of the predictions with respect to sparse labels. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is in the top-k highest + `predictions`, and computing the fraction of them for which `class_id` is + indeed a correct label. + If `class_id` is not specified, we'll calculate precision as how often on + average a class among the top-k classes with the highest predicted values + of a batch entry is correct and can be found in the label for that entry. + + `sparse_precision_at_k` creates two local variables, + `true_positive_at_` and `false_positive_at_`, that are used to compute + the precision@k frequency. This frequency is ultimately returned as + `precision_at_`: an idempotent operation that simply divides + `true_positive_at_` by total (`true_positive_at_` + + `false_positive_at_`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision_at_`. Internally, a `top_k` operation computes a `Tensor` + indicating the top `k` `predictions`. 
Set operations applied to `top_k` and + `labels` calculate the true positives and false positives weighted by + `weights`. Then `update_op` increments `true_positive_at_` and + `false_positive_at_` using these values. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. + k: Integer, k for @k metric. + class_id: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes], where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. + weights: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. + metrics_collections: An optional list of collections that values should + be added to. + updates_collections: An optional list of collections that updates should + be added to. + name: Name of new update operation, and namespace for other dependent ops. + + Returns: + precision: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. + update_op: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. + """ + default_name = _at_k_name('precision', k, class_id=class_id) + with ops.name_scope(name, default_name, + (predictions, labels, weights)) as scope: + _, top_k_idx = nn.top_k(predictions, k) + return _sparse_precision_at_k( + top_k_idx=top_k_idx, + labels=labels, + k=k, + class_id=class_id, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=scope) + + +def specificity_at_sensitivity( + labels, predictions, sensitivity, weights=None, num_thresholds=200, + metrics_collections=None, updates_collections=None, name=None): + """Computes the specificity at a given sensitivity. + + The `specificity_at_sensitivity` function creates four local + variables, `true_positives`, `true_negatives`, `false_positives` and + `false_negatives` that are used to compute the specificity at the given + sensitivity value. The threshold for the given sensitivity value is computed + and used to evaluate the corresponding specificity. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `specificity`. `update_op` increments the `true_positives`, `true_negatives`, + `false_positives` and `false_negatives` counts with the weight of each case + found in the `predictions` and `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. 
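A rough NumPy sketch of the threshold-selection idea described above may help; the counts below are invented purely for illustration, and the tie-breaking refinement used by the actual op is ignored here:

```python
import numpy as np

# Hypothetical aggregated confusion-matrix counts, one entry per threshold.
# In the streaming metric these come from `_confusion_matrix_at_thresholds`.
tp = np.array([50., 45., 30., 10.])   # true positives at each threshold
fn = np.array([0., 5., 20., 40.])     # false negatives at each threshold
tn = np.array([5., 30., 55., 70.])    # true negatives at each threshold
fp = np.array([65., 40., 15., 0.])    # false positives at each threshold

target_sensitivity = 0.9
eps = 1e-7  # guards against division by zero, mirroring `kepsilon`

sensitivities = tp / (tp + fn + eps)
# Pick the threshold whose measured sensitivity is closest to the target ...
idx = int(np.argmin(np.abs(sensitivities - target_sensitivity)))
# ... and report the specificity observed at that same threshold.
specificity = tn[idx] / (tn[idx] + fp[idx] + eps)
print(idx, specificity)  # index 1, specificity 30 / (30 + 40) ~= 0.43
```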
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+  Args:
+    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    sensitivity: A scalar value in range `[0, 1]`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    num_thresholds: The number of thresholds to use for matching the given
+      sensitivity.
+    metrics_collections: An optional list of collections that `specificity`
+      should be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    specificity: A scalar `Tensor` representing the specificity at the given
+      `sensitivity` value.
+    update_op: An operation that increments the `true_positives`,
+      `true_negatives`, `false_positives` and `false_negatives` variables
+      appropriately and whose value matches `specificity`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `sensitivity` is not between 0 and 1, or if either `metrics_collections`
+      or `updates_collections` are not a list or tuple.
+  """
+  if sensitivity < 0 or sensitivity > 1:
+    raise ValueError('`sensitivity` must be in the range [0, 1].')
+
+  with variable_scope.variable_scope(name, 'specificity_at_sensitivity',
+                                     (predictions, labels, weights)):
+    kepsilon = 1e-7  # to account for floating point imprecisions
+    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                  for i in range(num_thresholds-2)]
+    thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon]
+
+    values, update_ops = _confusion_matrix_at_thresholds(
+        labels, predictions, thresholds, weights)
+    tp = values['tp']
+    fn = values['fn']
+    tn = values['tn']
+    fp = values['fp']
+
+    def compute_specificity_at_sensitivity(name):
+      """Computes the specificity at the given sensitivity.
+
+      Args:
+        name: The name of the operation.
+
+      Returns:
+        The specificity using the aggregated values.
+      """
+      sensitivities = math_ops.div(tp, tp + fn + kepsilon)
+
+      # We'll need to use this trick until tf.argmax allows us to specify
+      # whether we should use the first or last index in case of ties.
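+      # (Taking the cumulative sum of the tie indicator and then argmax picks
+      # the *last* index whose sensitivity is closest to the requested value,
+      # i.e. the largest such threshold.)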
+ min_val = math_ops.reduce_min(math_ops.abs(sensitivities - sensitivity)) + indices_at_minval = math_ops.equal( + math_ops.abs(sensitivities - sensitivity), min_val) + indices_at_minval = math_ops.to_int64(indices_at_minval) + indices_at_minval = math_ops.cumsum(indices_at_minval) + tf_index = math_ops.argmax(indices_at_minval, 0) + tf_index = math_ops.cast(tf_index, dtypes.int32) + + # Now, we have the implicit threshold, so compute the specificity: + return math_ops.div(tn[tf_index], + tn[tf_index] + fp[tf_index] + kepsilon, + name) + + specificity = compute_specificity_at_sensitivity('value') + with ops.control_dependencies(update_ops.values()): + update_op = compute_specificity_at_sensitivity('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, specificity) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return specificity, update_op diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py index 601984799c0668..c1470892739226 100644 --- a/tensorflow/python/ops/nn.py +++ b/tensorflow/python/ops/nn.py @@ -110,6 +110,7 @@ @@depthwise_conv2d_native @@separable_conv2d @@atrous_conv2d +@@atrous_conv2d_transpose @@conv2d_transpose @@conv1d @@conv3d @@ -232,7 +233,7 @@ TensorFlow provides a number of methods for constructing Recurrent Neural Networks. Most accept an `RNNCell`-subclassed object -(see the documentation for `tf.nn.rnn_cell`). +(see the documentation for `tf.contrib.rnn`). @@dynamic_rnn @@rnn diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index afacef7acd9eaa..d2a5b259965ce5 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import sparse_ops -def log_poisson_loss(log_input, targets, compute_full_loss=False, name=None): +def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): """Computes log Poisson loss given `log_input`. Gives the log-likelihood loss between the prediction and the target under the @@ -57,8 +57,8 @@ def log_poisson_loss(log_input, targets, compute_full_loss=False, name=None): = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)] Args: - log_input: A `Tensor` of type `float32` or `float64`. targets: A `Tensor` of the same type and shape as `log_input`. + log_input: A `Tensor` of type `float32` or `float64`. compute_full_loss: whether to compute the full loss. If false, a constant term is dropped in favor of more efficient optimization. name: A name for the operation (optional). @@ -292,7 +292,8 @@ def zero_fraction(value, name=None): ```python z = tf.Relu(...) - summ = tf.contrib.deprecated.scalar_summary('sparsity', tf.nn.zero_fraction(z)) + summ = tf.contrib.deprecated.scalar_summary('sparsity', + tf.nn.zero_fraction(z)) ``` Args: @@ -351,11 +352,16 @@ def depthwise_conv2d(input, filter, strides, padding, name=None): return nn_ops.depthwise_conv2d_native( input, filter, strides, padding, name=name) + + # pylint: enable=redefined-builtin # pylint: disable=redefined-builtin,line-too-long -def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, +def separable_conv2d(input, + depthwise_filter, + pointwise_filter, + strides, padding, name=None): """2-D convolution with separable filters. @@ -418,12 +424,11 @@ def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, # If any of channel numbers is unknown, then the comparison below returns # None. See TensorShape.__gt__(). 
if channel_multiplier * in_channels > out_channels: - raise ValueError( - "Refusing to perform an overparameterized separable " - "convolution: channel_multiplier * in_channels = " - "%d * %d = %d > %d = out_channels" % - (channel_multiplier, in_channels, - channel_multiplier * in_channels, out_channels)) + raise ValueError("Refusing to perform an overparameterized separable " + "convolution: channel_multiplier * in_channels = " + "%d * %d = %d > %d = out_channels" % + (channel_multiplier, in_channels, + channel_multiplier * in_channels, out_channels)) # The layout of the ops in the graph are expected to be as follows: # depthwise_conv2d // Conv2D op corresponding to native deptwise conv. @@ -432,6 +437,8 @@ def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, input, depthwise_filter, strides, padding, name="depthwise") return nn_ops.conv2d( depthwise, pointwise_filter, [1, 1, 1, 1], padding="VALID", name=name) + + # pylint: enable=redefined-builtin,line-too-long @@ -463,15 +470,15 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None): with ops.name_scope(name, "sufficient_statistics", [x, shift]): x = ops.convert_to_tensor(x, name="x") x_shape = x.get_shape() - if x_shape.is_fully_defined(): + if all(x_shape[d].value is not None for d in axes): counts = 1 for d in axes: counts *= x_shape[d].value counts = constant_op.constant(counts, dtype=x.dtype) else: # shape needs to be inferred at runtime. - x_dims = array_ops.gather(array_ops.shape(x), axes) - counts = math_ops.cast( - math_ops.reduce_prod(x_dims), x.dtype, name="count") + x_dims = array_ops.gather( + math_ops.cast(array_ops.shape(x), x.dtype), axes) + counts = math_ops.reduce_prod(x_dims, name="count") if shift is not None: shift = ops.convert_to_tensor(shift, name="shift") m_ss = math_ops.sub(x, shift) @@ -591,10 +598,8 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False): # Note that we use keep_dims=True for our reductions regardless of the arg; # this is so that the results remain broadcast-compatible with the inputs. 
- weighted_input_sum = math_ops.reduce_sum(frequency_weights * x, - axes, - name="weighted_input_sum", - keep_dims=True) + weighted_input_sum = math_ops.reduce_sum( + frequency_weights * x, axes, name="weighted_input_sum", keep_dims=True) # The shape of the weights isn't necessarily the same as x's # shape, just broadcast-compatible with it -- so this expression @@ -605,10 +610,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False): broadcasted_weights = frequency_weights + array_ops.zeros_like(x) sum_of_weights = math_ops.reduce_sum( - broadcasted_weights, - axes, - name="sum_of_weights", - keep_dims=True) + broadcasted_weights, axes, name="sum_of_weights", keep_dims=True) divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum") @@ -625,8 +627,8 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False): if not keep_dims: weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes) - weighted_variance = array_ops.squeeze(weighted_variance, - squeeze_dims=axes) + weighted_variance = array_ops.squeeze( + weighted_variance, squeeze_dims=axes) if needs_cast: weighted_mean = math_ops.cast(weighted_mean, dtypes.float16) @@ -692,13 +694,16 @@ def batch_normalization(x, if offset is not None else -mean * inv) -def fused_batch_norm(x, scale, offset, # pylint: disable=invalid-name - mean=None, - variance=None, - epsilon=0.001, - data_format="NHWC", - is_training=True, - name=None): +def fused_batch_norm( + x, + scale, + offset, # pylint: disable=invalid-name + mean=None, + variance=None, + epsilon=0.001, + data_format="NHWC", + is_training=True, + name=None): r"""Batch normalization. As described in http://arxiv.org/abs/1502.03167. @@ -960,9 +965,10 @@ def _compute_sampled_logits(weights, # true_logits is a float tensor, ones_like(true_logits) is a float tensor # of ones. We then divide by num_true to ensure the per-example labels sum # to 1.0, i.e. form a proper probability distribution. - out_labels = array_ops.concat(1, - [array_ops.ones_like(true_logits) / num_true, - array_ops.zeros_like(sampled_logits)]) + out_labels = array_ops.concat(1, [ + array_ops.ones_like(true_logits) / num_true, + array_ops.zeros_like(sampled_logits) + ]) return out_logits, out_labels @@ -981,8 +987,10 @@ def nce_loss(weights, """Computes and returns the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for - unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). - Also see our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf) + unnormalized statistical + models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). + Also see our [Candidate Sampling Algorithms + Reference](../../extras/candidate_sampling.pdf) Note: By default this uses a log-uniform (Zipfian) distribution for sampling, so your labels must be sorted in order of decreasing frequency to achieve diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 35610cc5540453..b2c6cf713898b8 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1078,6 +1078,151 @@ def conv2d_transpose(value, name=name) +def atrous_conv2d_transpose(value, + filters, + output_shape, + rate, + padding, + name=None): + """The transpose of `atrous_conv2d`. 
+ + This operation is sometimes called "deconvolution" after [Deconvolutional + Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is + actually the transpose (gradient) of `atrous_conv2d` rather than an actual + deconvolution. + + Args: + value: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC` + format. Its shape is `[batch, in_height, in_width, in_channels]`. + filters: A 4-D `Tensor` with the same type as `value` and shape + `[filter_height, filter_width, out_channels, in_channels]`. `filters`' + `in_channels` dimension must match that of `value`. Atrous convolution is + equivalent to standard convolution with upsampled filters with effective + height `filter_height + (filter_height - 1) * (rate - 1)` and effective + width `filter_width + (filter_width - 1) * (rate - 1)`, produced by + inserting `rate - 1` zeros along consecutive elements across the + `filters`' spatial dimensions. + output_shape: A 1-D `Tensor` of shape representing the output shape of the + deconvolution op. + rate: A positive int32. The stride with which we sample input values across + the `height` and `width` dimensions. Equivalently, the rate by which we + upsample the filter values by inserting zeros across the `height` and + `width` dimensions. In the literature, the same parameter is sometimes + called `input stride` or `dilation`. + padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. + name: Optional name for the returned tensor. + + Returns: + A `Tensor` with the same type as `value`. + + Raises: + ValueError: If input/output depth does not match `filters`' shape, or if + padding is other than `'VALID'` or `'SAME'`, or if the `rate` is less + than one, or if the output_shape is not a tensor with 4 elements. + """ + with ops.name_scope(name, "atrous_conv2d_transpose", + [value, filters, output_shape]) as name: + value = ops.convert_to_tensor(value, name="value") + filters = ops.convert_to_tensor(filters, name="filters") + if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]): + raise ValueError( + "value's input channels does not match filters' input channels, " + "{} != {}".format(value.get_shape()[3], filters.get_shape()[3])) + if rate < 1: + raise ValueError("rate {} cannot be less than one".format(rate)) + + if rate == 1: + return conv2d_transpose(value, + filters, + output_shape, + strides=[1, 1, 1, 1], + padding=padding, + data_format="NHWC") + + output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") + if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): + raise ValueError("output_shape must have shape (4,), got {}" + .format(output_shape_.get_shape())) + + if isinstance(output_shape, (list, np.ndarray)): + # output_shape's shape should be == [4] if reached this point. + if not filters.get_shape()[2].is_compatible_with(output_shape[3]): + raise ValueError( + "output_shape does not match filter's output channels, " + "{} != {}".format(output_shape[3], filters.get_shape()[2])) + + # We have two padding contributions. The first is used for converting "SAME" + # to "VALID". The second is required so that the height and width of the + # zero-padded value tensor are multiples of rate. + + # Padding required to reduce to "VALID" convolution + if padding == "SAME": + # Handle filters whose shape is unknown during graph creation. 
+ if filters.get_shape().is_fully_defined(): + filter_shape = filters.get_shape().as_list() + else: + filter_shape = array_ops.shape(filters) + filter_height, filter_width = filter_shape[0], filter_shape[1] + + # Spatial dimensions of the filters and the upsampled filters in which we + # introduce (rate - 1) zeros between consecutive filter values. + filter_height_up = filter_height + (filter_height - 1) * (rate - 1) + filter_width_up = filter_width + (filter_width - 1) * (rate - 1) + + pad_height = filter_height_up - 1 + pad_width = filter_width_up - 1 + + # When pad_height (pad_width) is odd, we pad more to bottom (right), + # following the same convention as conv2d(). + pad_top = pad_height // 2 + pad_bottom = pad_height - pad_top + pad_left = pad_width // 2 + pad_right = pad_width - pad_left + elif padding == "VALID": + pad_top = 0 + pad_bottom = 0 + pad_left = 0 + pad_right = 0 + else: + raise ValueError("padding must be either VALID or SAME:" + " {}".format(padding)) + + in_height = output_shape[1] + pad_top + pad_bottom + in_width = output_shape[2] + pad_left + pad_right + + # More padding so that rate divides the height and width of the input. + pad_bottom_extra = (rate - in_height % rate) % rate + pad_right_extra = (rate - in_width % rate) % rate + + # The paddings argument to space_to_batch is just the extra padding + # component. + space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]] + + value = array_ops.space_to_batch(input=value, + paddings=space_to_batch_pad, + block_size=rate) + + input_sizes = [rate * rate * output_shape[0], + (in_height + pad_bottom_extra) // rate, + (in_width + pad_right_extra) // rate, + output_shape[3]] + + value = gen_nn_ops.conv2d_backprop_input(input_sizes=input_sizes, + filter=filters, + out_backprop=value, + strides=[1, 1, 1, 1], + padding="VALID", + data_format="NHWC") + + # The crops argument to batch_to_space includes both padding components. + batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra], + [pad_left, pad_right + pad_right_extra]] + + return array_ops.batch_to_space(input=value, + crops=batch_to_space_crop, + block_size=rate) + + def conv3d_transpose(value, filter, output_shape, diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index f10a32d6211494..a4504cc9f84a05 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Tests for miscellaneous functionality in tensorflow.ops.nn.""" from __future__ import absolute_import from __future__ import division @@ -101,8 +100,8 @@ def testLogPoissonLoss(self): y_np = self._log_poisson_loss(x_np, z_np, compute_full_loss=False) y_np_stirling = self._log_poisson_loss(x_np, z_np, compute_full_loss=True) with self.test_session(): - y_tf = tf.nn.log_poisson_loss(x_np, z_np, compute_full_loss=False) - y_tf_stirling = tf.nn.log_poisson_loss(x_np, z_np, compute_full_loss=True) + y_tf = tf.nn.log_poisson_loss(z_np, x_np, compute_full_loss=False) + y_tf_stirling = tf.nn.log_poisson_loss(z_np, x_np, compute_full_loss=True) y_tf_np = y_tf.eval() y_tf_np_stirling = y_tf_stirling.eval() eps = 1e-3 @@ -115,8 +114,8 @@ def testGradient(self): z_np = np.random.randint(0, 5, size=x_shape).astype(np.float64) with self.test_session(): x_tf = tf.constant(x_np) - y_tf = tf.nn.log_poisson_loss(x_tf, z_np, compute_full_loss=False) - y_tf_stirling = tf.nn.log_poisson_loss(x_tf, z_np, compute_full_loss=True) + y_tf = tf.nn.log_poisson_loss(z_np, x_tf, compute_full_loss=False) + y_tf_stirling = tf.nn.log_poisson_loss(z_np, x_tf, compute_full_loss=True) err = tf.test.compute_gradient_error(x_tf, x_shape, y_tf, x_shape) err_stirling = tf.test.compute_gradient_error(x_tf, x_shape, y_tf_stirling, x_shape) @@ -160,8 +159,8 @@ class L2LossTest(tf.test.TestCase): def testL2Loss(self): for dtype in [tf.float32, tf.float64]: with self.test_session(): - x = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x", - dtype=dtype) + x = tf.constant( + [1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x", dtype=dtype) l2loss = tf.nn.l2_loss(x) value = l2loss.eval() self.assertAllClose(7.0, value) @@ -330,9 +329,8 @@ def testShapedDropoutUnknownShape(self): y_dim = 30 keep_prob = 0.5 x = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) - dropout_x = tf.nn.dropout(x, - keep_prob, - noise_shape=tf.placeholder(tf.int32)) + dropout_x = tf.nn.dropout( + x, keep_prob, noise_shape=tf.placeholder(tf.int32)) self.assertEqual(x.get_shape(), dropout_x.get_shape()) def testInvalidKeepProb(self): @@ -389,25 +387,30 @@ def _GenerateTestInputs(self): np.random.seed(0) weights = np.random.randn(self._num_classes, self._dim).astype(np.float32) biases = np.random.randn(self._num_classes).astype(np.float32) - hidden_acts = np.random.randn(self._batch_size, self._dim).astype( - np.float32) + hidden_acts = np.random.randn(self._batch_size, + self._dim).astype(np.float32) sharded_weights = [ - weights[[row for row in range(self._num_classes) - if row % self._num_shards == shard]] - for shard in range(self._num_shards)] + weights[[ + row for row in range(self._num_classes) + if row % self._num_shards == shard + ]] for shard in range(self._num_shards) + ] return weights, biases, hidden_acts, sharded_weights - def _ComputeSampledLogitsNP(self, true_w, true_b, sampled_w, sampled_b, + def _ComputeSampledLogitsNP(self, + true_w, + true_b, + sampled_w, + sampled_b, hidden_acts, num_true=1, true_expected=None, sampled_expected=None): batch_size, dim = hidden_acts.shape - true_logits = np.sum( - hidden_acts.reshape((batch_size, 1, dim)) * true_w.reshape( - (batch_size, num_true, dim)), - axis=2) + true_logits = np.sum(hidden_acts.reshape( + (batch_size, 1, dim)) * true_w.reshape((batch_size, num_true, dim)), + axis=2) true_b = true_b.reshape((batch_size, num_true)) true_logits += true_b sampled_logits = np.dot(hidden_acts, sampled_w.T) + sampled_b @@ 
-423,9 +426,17 @@ def _ComputeSampledLogitsNP(self, true_w, true_b, sampled_w, sampled_b, return out_logits, out_labels - def _ComputeSampledLogitsTF(self, weights, biases, hidden_acts, labels, - num_sampled, num_classes, num_true, sampled_vals, - subtract_log_q, remove_accidental_hits, + def _ComputeSampledLogitsTF(self, + weights, + biases, + hidden_acts, + labels, + num_sampled, + num_classes, + num_true, + sampled_vals, + subtract_log_q, + remove_accidental_hits, name="sampled_loss_TF"): # Should be called from within a `with test_session():` block if isinstance(weights, list): @@ -433,11 +444,10 @@ def _ComputeSampledLogitsTF(self, weights, biases, hidden_acts, labels, else: weights_tf = tf.constant(weights) biases_tf = tf.constant(biases) - hidden_acts_tf = tf.constant(hidden_acts, - shape=(self._batch_size, self._dim)) - labels_tf = tf.constant(labels, - dtype=tf.int64, - shape=(self._batch_size, num_true)) + hidden_acts_tf = tf.constant( + hidden_acts, shape=(self._batch_size, self._dim)) + labels_tf = tf.constant( + labels, dtype=tf.int64, shape=(self._batch_size, num_true)) pred_logits_tf, pred_labels_tf = _compute_sampled_logits( weights_tf, @@ -464,16 +474,26 @@ def testComputeSampledLogitsShapes(self): with self.test_session() as sess: for num_true_test in range(1, 5): - labels = np.random.randint(low=0, high=self._num_classes, - size=self._batch_size * num_true_test) + labels = np.random.randint( + low=0, + high=self._num_classes, + size=self._batch_size * num_true_test) true_w, true_b = weights[labels], biases[labels] logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + hidden_acts, num_true=num_true_test) logits_tf, labels_tf = self._ComputeSampledLogitsTF( - weights, biases, hidden_acts, labels, num_sampled, + weights, + biases, + hidden_acts, + labels, + num_sampled, self._num_classes, num_true=num_true_test, sampled_vals=test_sampled_vals, @@ -500,16 +520,26 @@ def testComputeSampledLogitsValues(self): with self.test_session() as sess: for num_true_test in range(1, 5): # Generate test data for this run - labels = np.random.randint(low=0, high=self._num_classes, - size=self._batch_size * num_true_test) + labels = np.random.randint( + low=0, + high=self._num_classes, + size=self._batch_size * num_true_test) true_w, true_b = weights[labels], biases[labels] # Test 1: Without accidental hit removal or subtract_log_q logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + hidden_acts, num_true=num_true_test) logits_tf, labels_tf = self._ComputeSampledLogitsTF( - weights, biases, hidden_acts, labels, num_sampled, + weights, + biases, + hidden_acts, + labels, + num_sampled, self._num_classes, num_true=num_true_test, sampled_vals=test_sampled_vals, @@ -523,7 +553,11 @@ def testComputeSampledLogitsValues(self): # Test 2: With accidental hit removal, no subtract_log_q logits_tf, labels_tf = self._ComputeSampledLogitsTF( - weights, biases, hidden_acts, labels, num_sampled, + weights, + biases, + hidden_acts, + labels, + num_sampled, self._num_classes, num_true=num_true_test, sampled_vals=test_sampled_vals, @@ -545,12 +579,20 @@ def testComputeSampledLogitsValues(self): # Test 3: With subtract_log_q, no accidental hit removal logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + 
hidden_acts, num_true=num_true_test, true_expected=true_exp, sampled_expected=sampled_exp) logits_tf, labels_tf = self._ComputeSampledLogitsTF( - weights, biases, hidden_acts, labels, num_sampled, + weights, + biases, + hidden_acts, + labels, + num_sampled, self._num_classes, num_true=num_true_test, sampled_vals=test_sampled_vals, @@ -564,10 +606,18 @@ def testComputeSampledLogitsValues(self): # Test 4: Test 1, with sharded weights logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + hidden_acts, num_true=num_true_test) logits_tf, labels_tf = self._ComputeSampledLogitsTF( - sharded_weights, biases, hidden_acts, labels, num_sampled, + sharded_weights, + biases, + hidden_acts, + labels, + num_sampled, self._num_classes, num_true=num_true_test, sampled_vals=test_sampled_vals, @@ -604,7 +654,11 @@ def _SigmoidCrossEntropyWithLogits(logits, targets): with self.test_session(): logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + hidden_acts, true_expected=true_exp, sampled_expected=sampled_exp) nce_loss_np = np.sum( @@ -615,14 +669,15 @@ def _SigmoidCrossEntropyWithLogits(logits, targets): biases_tf = tf.constant(biases) inputs_tf = tf.constant(hidden_acts) - nce_loss_tf = tf.nn.nce_loss(weights_tf, - biases_tf, - inputs_tf, - labels_tf, - num_sampled=1, - num_classes=self._num_classes, - num_true=1, - sampled_values=test_sampled_vals) + nce_loss_tf = tf.nn.nce_loss( + weights_tf, + biases_tf, + inputs_tf, + labels_tf, + num_sampled=1, + num_classes=self._num_classes, + num_true=1, + sampled_values=test_sampled_vals) self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4) @@ -662,7 +717,11 @@ def _SoftmaxCrossEntropyWithLogits(logits, targets): with self.test_session(): logits_np, labels_np = self._ComputeSampledLogitsNP( - true_w, true_b, sampled_w, sampled_b, hidden_acts, + true_w, + true_b, + sampled_w, + sampled_b, + hidden_acts, true_expected=true_exp, sampled_expected=sampled_exp) sampled_softmax_loss_np = _SoftmaxCrossEntropyWithLogits(logits_np, @@ -684,8 +743,8 @@ def _SoftmaxCrossEntropyWithLogits(logits, targets): sampled_values=test_sampled_vals, remove_accidental_hits=False) - self.assertAllClose( - sampled_softmax_loss_np, sampled_softmax_loss_tf.eval(), 1e-4) + self.assertAllClose(sampled_softmax_loss_np, + sampled_softmax_loss_tf.eval(), 1e-4) # Test with sharded weights sampled_softmax_loss_tf = tf.nn.sampled_softmax_loss( @@ -699,8 +758,8 @@ def _SoftmaxCrossEntropyWithLogits(logits, targets): sampled_values=test_sampled_vals, remove_accidental_hits=False) - self.assertAllClose( - sampled_softmax_loss_np, sampled_softmax_loss_tf.eval(), 1e-4) + self.assertAllClose(sampled_softmax_loss_np, + sampled_softmax_loss_tf.eval(), 1e-4) class CReluTest(tf.test.TestCase): diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index fa99e3a49b2c4e..0d52004a55ac0a 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -291,7 +291,7 @@ def parse_example(serialized, features, name=None, example_names=None): ``` {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], values=[1.0, 2.0, 3.0], - shape=(3, 2)) } + dense_shape=(3, 2)) } ``` Given two `Example` input protos in `serialized`: @@ -705,7 +705,7 @@ def parse_single_sequence_example( `sequence_features` contains `VarLenFeature` and `FixedLenSequenceFeature` 
objects. Each `VarLenFeature` is mapped to a `SparseTensor`, and each `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type. - The shape will be `(T,) + df.shape` for `FixedLenSequenceFeature` `df`, where + The shape will be `(T,) + df.dense_shape` for `FixedLenSequenceFeature` `df`, where `T` is the length of the associated `FeatureList` in the `SequenceExample`. For instance, `FixedLenSequenceFeature([])` yields a scalar 1-D `Tensor` of static shape `[None]` and dynamic shape `[T]`, while diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index 61536ab4a05b77..8b31862da4482e 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -305,6 +305,15 @@ def state_saving_rnn(cell, inputs, state_saver, state_name, return (outputs, state) +def _on_device(fn, device): + """Build the subgraph defined by lambda `fn` on `device` if it's not None.""" + if device: + with ops.device(device): + return fn() + else: + return fn() + + # pylint: disable=unused-argument def _rnn_step( time, sequence_length, min_sequence_length, max_sequence_length, @@ -366,7 +375,9 @@ def _rnn_step( def _copy_one_through(output, new_output): copy_cond = (time >= sequence_length) - return array_ops.where(copy_cond, output, new_output) + return _on_device( + lambda: array_ops.where(copy_cond, output, new_output), + device=new_output.op.device) def _copy_some_through(flat_new_output, flat_new_state): # Use broadcasting select to determine which values should get @@ -1082,7 +1093,7 @@ def raw_rnn(cell, loop_fn, inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) inputs_ta = inputs_ta.unpack(inputs) - cell = tf.nn.rnn_cell.LSTMCell(num_units) + cell = tf.contrib.rnn.LSTMCell(num_units) def loop_fn(time, cell_output, cell_state, loop_state): emit_output = cell_output # == None for time == 0 @@ -1296,11 +1307,17 @@ def body(time, elements_finished, current_input, loop_state = loop_state if next_loop_state is None else next_loop_state def _copy_some_through(current, candidate): + """Copy some tensors through via array_ops.where.""" current_flat = nest.flatten(current) candidate_flat = nest.flatten(candidate) + # pylint: disable=g-long-lambda,cell-var-from-loop result_flat = [ - array_ops.where(elements_finished, current_i, candidate_i) + _on_device( + lambda: array_ops.where( + elements_finished, current_i, candidate_i), + device=candidate_i.op.device) for (current_i, candidate_i) in zip(current_flat, candidate_flat)] + # pylint: enable=g-long-lambda,cell-var-from-loop return nest.pack_sequence_as( structure=current, flat_sequence=result_flat) diff --git a/tensorflow/python/ops/seq2seq.py b/tensorflow/python/ops/seq2seq.py index 5bda634aeecad9..924b8b03c4c4b3 100644 --- a/tensorflow/python/ops/seq2seq.py +++ b/tensorflow/python/ops/seq2seq.py @@ -995,7 +995,7 @@ def sequence_loss_by_example(logits, targets, weights, weights: List of 1D batch-sized float-Tensors of the same length as logits. average_across_timesteps: If set, divide the returned cost by the total label weight. - softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch + softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch to be used instead of the standard softmax (the default if this is None). name: Optional name for this operation, default: "sequence_loss_by_example". @@ -1018,9 +1018,9 @@ def sequence_loss_by_example(logits, targets, weights, # violates our general scalar strictness policy. 
target = array_ops.reshape(target, [-1]) crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( - logit, target) + logits=logit, labels=target) else: - crossent = softmax_loss_function(logit, target) + crossent = softmax_loss_function(target, logit) log_perp_list.append(crossent * weight) log_perps = math_ops.add_n(log_perp_list) if average_across_timesteps: diff --git a/tensorflow/python/ops/sets.py b/tensorflow/python/ops/sets.py new file mode 100644 index 00000000000000..6b76f106998da2 --- /dev/null +++ b/tensorflow/python/ops/sets.py @@ -0,0 +1,184 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python layer for sets. + +@@set_size +@@set_intersection +@@set_union +@@set_difference +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import gen_set_ops + + +_VALID_DTYPES = set([ + dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, + dtypes.uint8, dtypes.uint16, dtypes.string]) + + +def set_size(a, validate_indices=True): + """Compute number of unique elements along last dimension of `a`. + + Args: + a: `SparseTensor`, with indices sorted in row-major order. + validate_indices: Whether to validate the order and range of sparse indices + in `a`. + + Returns: + `int32` `Tensor` of set sizes. For `a` ranked `n`, this is a `Tensor` with + rank `n-1`, and the same 1st `n-1` dimensions as `a`. Each value is the + number of unique elements in the corresponding `[0...n-1]` dimension of `a`. + + Raises: + TypeError: If `a` is an invalid types. + """ + a = sparse_tensor.convert_to_tensor_or_sparse_tensor(a, name="a") + if not isinstance(a, sparse_tensor.SparseTensor): + raise TypeError("Expected `SparseTensor`, got %s." % a) + if a.values.dtype.base_dtype not in _VALID_DTYPES: + raise TypeError("Invalid dtype %s." % a.values.dtype) + # pylint: disable=protected-access + return gen_set_ops.set_size(a.indices, a.values, a.shape, validate_indices) + +ops.NotDifferentiable("SetSize") + + +ops.NotDifferentiable("DenseToDenseSetOperation") +ops.NotDifferentiable("DenseToSparseSetOperation") +ops.NotDifferentiable("SparseToSparseSetOperation") + + +def _set_operation(a, b, set_operation, validate_indices=True): + """Compute set operation of elements in last dimension of `a` and `b`. + + All but the last dimension of `a` and `b` must match. + + Args: + a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices + must be sorted in row-major order. + b: `Tensor` or `SparseTensor` of the same type as `a`. Must be + `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be + sorted in row-major order. + set_operation: String indicating set operaiton. 
See + SetOperationOp::SetOperationFromContext for valid values. + validate_indices: Whether to validate the order and range of sparse indices + in `a` and `b`. + + Returns: + A `SparseTensor` with the same rank as `a` and `b`, and all but the last + dimension the same. Elements along the last dimension contain the results + of the set operation. + + Raises: + TypeError: If inputs are invalid types. + ValueError: If `a` is sparse and `b` is dense. + """ + a = sparse_tensor.convert_to_tensor_or_sparse_tensor(a, name="a") + if a.dtype.base_dtype not in _VALID_DTYPES: + raise TypeError("'a' invalid dtype %s." % a.dtype) + b = sparse_tensor.convert_to_tensor_or_sparse_tensor(b, name="b") + if b.dtype.base_dtype != a.dtype.base_dtype: + raise TypeError("Types don't match, %s vs %s." % (a.dtype, b.dtype)) + # pylint: disable=protected-access + if isinstance(a, sparse_tensor.SparseTensor): + if isinstance(b, sparse_tensor.SparseTensor): + indices, values, shape = gen_set_ops.sparse_to_sparse_set_operation( + a.indices, a.values, a.shape, b.indices, b.values, b.dense_shape, + set_operation, validate_indices) + else: + raise ValueError("Sparse,Dense is not supported, but Dense,Sparse is. " + "Please flip the order of your inputs.") + elif isinstance(b, sparse_tensor.SparseTensor): + indices, values, shape = gen_set_ops.dense_to_sparse_set_operation( + a, b.indices, b.values, b.dense_shape, set_operation, validate_indices) + else: + indices, values, shape = gen_set_ops.dense_to_dense_set_operation( + a, b, set_operation, validate_indices) + # pylint: enable=protected-access + return sparse_tensor.SparseTensor(indices, values, shape) + + +def set_intersection(a, b, validate_indices=True): + """Compute set intersection of elements in last dimension of `a` and `b`. + + All but the last dimension of `a` and `b` must match. + + Args: + a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices + must be sorted in row-major order. + b: `Tensor` or `SparseTensor` of the same type as `a`. Must be + `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be + sorted in row-major order. + validate_indices: Whether to validate the order and range of sparse indices + in `a` and `b`. + + Returns: + A `SparseTensor` with the same rank as `a` and `b`, and all but the last + dimension the same. Elements along the last dimension contain the + intersections. + """ + return _set_operation(a, b, "intersection", validate_indices) + + +def set_difference(a, b, aminusb=True, validate_indices=True): + """Compute set difference of elements in last dimension of `a` and `b`. + + All but the last dimension of `a` and `b` must match. + + Args: + a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices + must be sorted in row-major order. + b: `Tensor` or `SparseTensor` of the same type as `a`. Must be + `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be + sorted in row-major order. + aminusb: Whether to subtract `b` from `a`, vs vice versa. + validate_indices: Whether to validate the order and range of sparse indices + in `a` and `b`. + + Returns: + A `SparseTensor` with the same rank as `a` and `b`, and all but the last + dimension the same. Elements along the last dimension contain the + differences. + """ + return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices) + + +def set_union(a, b, validate_indices=True): + """Compute set union of elements in last dimension of `a` and `b`. + + All but the last dimension of `a` and `b` must match. 
+ + Args: + a: `Tensor` or `SparseTensor` of the same type as `b`. If sparse, indices + must be sorted in row-major order. + b: `Tensor` or `SparseTensor` of the same type as `a`. Must be + `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be + sorted in row-major order. + validate_indices: Whether to validate the order and range of sparse indices + in `a` and `b`. + + Returns: + A `SparseTensor` with the same rank as `a` and `b`, and all but the last + dimension the same. Elements along the last dimension contain the + unions. + """ + return _set_operation(a, b, "union", validate_indices) diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 4363be820fd28e..f79370e2934a54 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -70,8 +70,8 @@ # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_sparse_ops import * - # pylint: enable=wildcard-import +from tensorflow.python.util import deprecation def _convert_to_sparse_tensor(sp_input): @@ -115,7 +115,11 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access -def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): +def sparse_concat(axis, + sp_inputs, + name=None, + expand_nonconcat_dim=False, + concat_dim=None): """Concatenates a list of `SparseTensor` along the specified dimension. Concatenation is with respect to the dense versions of each sparse input. @@ -143,7 +147,7 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): values across all inputs. This is due to the need for an internal sort in order to concatenate efficiently across an arbitrary dimension. - For example, if `concat_dim = 1` and the inputs are + For example, if `axis = 1` and the inputs are sp_inputs[0]: shape = [2, 3] [0, 2]: "a" @@ -168,7 +172,7 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): [ a] concat [ d e ] = [ a d e ] [b c ] [ ] [b c ] - Another example, if 'concat_dim = 1' and the inputs are + Another example, if 'axis = 1' and the inputs are sp_inputs[0]: shape = [3, 3] [0, 2]: "a" @@ -197,12 +201,13 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): Args: - concat_dim: Dimension to concatenate along. Must be in range [-rank, rank), + axis: Dimension to concatenate along. Must be in range [-rank, rank), where rank is the number of dimensions in each input `SparseTensor`. sp_inputs: List of `SparseTensor` to concatenate. name: A name prefix for the returned tensors (optional). expand_nonconcat_dim: Whether to allow the expansion in the non-concat dimensions. Defaulted to False. + concat_dim: The old (deprecated) name for axis. Returns: A `SparseTensor` with the concatenated output. @@ -210,6 +215,8 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): Raises: TypeError: If `sp_inputs` is not a list of `SparseTensor`. """ + axis = deprecation.deprecated_argument_lookup("axis", axis, "concat_dim", + concat_dim) sp_inputs = _convert_to_sparse_tensors(sp_inputs) if len(sp_inputs) == 1: # Degenerate case of one tensor. 
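A minimal usage sketch of the renamed `axis` argument, reusing the two inputs from the `sparse_concat` docstring example above and assuming the pre-1.0 graph/session workflow (this snippet is not part of the patch itself):

```python
import tensorflow as tf
from tensorflow.python.ops import sparse_ops

# shape [2, 3]: "a" at [0, 2], "b" at [1, 0], "c" at [1, 1]
sp_a = tf.SparseTensor([[0, 2], [1, 0], [1, 1]], ["a", "b", "c"], [2, 3])
# shape [2, 4]: "d" at [0, 1], "e" at [0, 2]
sp_b = tf.SparseTensor([[0, 1], [0, 2]], ["d", "e"], [2, 4])

# New argument name; the old `concat_dim` keyword is still mapped onto it
# through `deprecation.deprecated_argument_lookup`.
sp_c = sparse_ops.sparse_concat(axis=1, sp_inputs=[sp_a, sp_b])

with tf.Session() as sess:
  print(sess.run(sp_c))  # sparse value with dense shape [2, 7]
```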
@@ -217,20 +224,20 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False): inds = [sp_input.indices for sp_input in sp_inputs] vals = [sp_input.values for sp_input in sp_inputs] - shapes = [sp_input.shape for sp_input in sp_inputs] + shapes = [sp_input.dense_shape for sp_input in sp_inputs] if expand_nonconcat_dim: max_shape = math_ops.reduce_max( array_ops.concat(0, [array_ops.reshape(shape, [1, -1]) for shape in shapes]), 0) shapes = [array_ops.concat(0, [ - max_shape[:concat_dim], shape[-1:] if concat_dim == -1 else - shape[concat_dim:concat_dim + 1], [] if concat_dim == -1 else - max_shape[concat_dim + 1:] + max_shape[:axis], shape[-1:] if axis == -1 else + shape[axis:axis + 1], [] if axis == -1 else + max_shape[axis + 1:] ]) for shape in shapes] output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat( - inds, vals, shapes, concat_dim, name=name)) + inds, vals, shapes, axis, name=name)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -294,14 +301,16 @@ def sparse_add(a, b, thresh=0): thresh = ops.convert_to_tensor( thresh, dtype=a.values.dtype.real_dtype, name="thresh") output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add( - a.indices, a.values, a.shape, b.indices, b.values, b.shape, thresh)) + a.indices, a.values, a.dense_shape, + b.indices, b.values, b.dense_shape, + thresh)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) else: # swap to make `a` the SparseTensor. if isinstance(b, sparse_classes): a, b = b, a - return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values, a.shape, - b) + return gen_sparse_ops._sparse_tensor_dense_add( + a.indices, a.values, a.dense_shape, b) def sparse_dense_cwise_add(sp_t, dense_t): @@ -325,8 +334,8 @@ def sparse_dense_cwise_add(sp_t, dense_t): output: the SparseTensor output. """ result = gen_sparse_ops.sparse_dense_cwise_add(sp_t.indices, sp_t.values, - sp_t.shape, dense_t) - return sparse_tensor.SparseTensor(sp_t.indices, result, sp_t.shape) + sp_t.dense_shape, dense_t) + return sparse_tensor.SparseTensor(sp_t.indices, result, sp_t.dense_shape) def sparse_reorder(sp_input, name=None): @@ -367,10 +376,10 @@ def sparse_reorder(sp_input, name=None): sp_input = _convert_to_sparse_tensor(sp_input) reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder( - sp_input.indices, sp_input.values, sp_input.shape, name=name)) + sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) return sparse_tensor.SparseTensor(reordered_ind, reordered_val, - array_ops.identity(sp_input.shape)) + array_ops.identity(sp_input.dense_shape)) def sparse_reshape(sp_input, shape, name=None): @@ -422,19 +431,29 @@ def sparse_reshape(sp_input, shape, name=None): with ops.name_scope(name, "SparseReshape", [sp_input]) as name: reshaped_ind, reshaped_shape = gen_sparse_ops._sparse_reshape( - sp_input.indices, sp_input.shape, shape, name=name) + sp_input.indices, sp_input.dense_shape, shape, name=name) return sparse_tensor.SparseTensor( reshaped_ind, array_ops.identity(sp_input.values), reshaped_shape) -def sparse_split(split_dim, num_split, sp_input, name=None): - """Split a `SparseTensor` into `num_split` tensors along `split_dim`. 
+# TODO(aselle): Remove keyword required once for 1.0 final +class KeywordRequired(object): + + def __repr__(self): + # This is needed to make documentation without fully qualified module paths + return "KeywordRequired()" + - If the `sp_input.shape[split_dim]` is not an integer multiple of `num_split` - each slice starting from 0:`shape[split_dim] % num_split` gets extra one - dimension. For example, if `split_dim = 1` and `num_split = 2` and the +def sparse_split(keyword_required=KeywordRequired(), + sp_input=None, num_split=None, axis=None, + name=None, split_dim=None): + """Split a `SparseTensor` into `num_split` tensors along `axis`. + + If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split` + each slice starting from 0:`shape[axis] % num_split` gets extra one + dimension. For example, if `axis = 1` and `num_split = 2` and the input is: input_tensor = shape = [2, 7] @@ -452,24 +471,37 @@ def sparse_split(split_dim, num_split, sp_input, name=None): [ ] Args: - split_dim: A 0-D `int32` `Tensor`. The dimension along which to split. - num_split: A Python integer. The number of ways to split. + keyword_required: Python 2 standin for * (temporary for argument reorder) sp_input: The `SparseTensor` to split. + num_split: A Python integer. The number of ways to split. + axis: A 0-D `int32` `Tensor`. The dimension along which to split. name: A name for the operation (optional). + split_dim: Deprecated old name for axis. Returns: `num_split` `SparseTensor` objects resulting from splitting `value`. Raises: TypeError: If `sp_input` is not a `SparseTensor`. + ValueError: If the deprecated `split_dim` and `axis` are both non None. """ + if not isinstance(keyword_required, KeywordRequired): + raise ValueError("Keyword arguments are required for this function.") + if sp_input is None: + raise ValueError("sp_input is required") + if num_split is None: + raise ValueError("num_split is required") + if axis is None: + raise ValueError("axis is required") + axis = deprecation.deprecated_argument_lookup("axis", axis, "split_dim", + split_dim) sp_input = _convert_to_sparse_tensor(sp_input) output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split( - split_dim, + axis, sp_input.indices, sp_input.values, - sp_input.shape, + sp_input.dense_shape, num_split, name=name)) sparse_tensors = [] @@ -577,7 +609,8 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False, """ return gen_sparse_ops.sparse_reduce_sum( sp_input.indices, sp_input.values, - sp_input.shape, math_ops._ReductionDims(sp_input, axis, reduction_axes), + sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims) @@ -611,8 +644,8 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False, output_ind, output_val, output_shape = ( gen_sparse_ops.sparse_reduce_sum_sparse( sp_input.indices, sp_input.values, - sp_input.shape, math_ops._ReductionDims(sp_input, axis, - reduction_axes), + sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis, + reduction_axes), keep_dims)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -651,7 +684,7 @@ def sparse_tensor_to_dense(sp_input, name: A name prefix for the returned tensors (optional). Returns: - A dense tensor with shape `sp_input.shape` and values specified by + A dense tensor with shape `sp_input.dense_shape` and values specified by the non-empty values in `sp_input`. Indices not in `sp_input` are assigned `default_value`. 
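Similarly, a short, hypothetical sketch of the reworked `sparse_split` above, which now requires keyword arguments and uses `axis` in place of `split_dim` (again assuming the pre-1.0 session workflow):

```python
import tensorflow as tf
from tensorflow.python.ops import sparse_ops

# A hypothetical [2, 7] sparse input with seven non-empty values.
sp = tf.SparseTensor(
    [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 6]],
    ["a", "b", "c", "d", "e", "f", "g"], [2, 7])

# All arguments must now be passed by keyword; a positional call hits the
# `KeywordRequired` sentinel and raises a ValueError.
parts = sparse_ops.sparse_split(sp_input=sp, num_split=2, axis=1)

with tf.Session() as sess:
  left, right = sess.run(parts)
  # 7 % 2 == 1, so the first slice gets the extra column: [2, 4] and [2, 3].
  print(left)
  print(right)
```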
@@ -662,7 +695,7 @@ def sparse_tensor_to_dense(sp_input, return sparse_to_dense( sp_input.indices, - sp_input.shape, + sp_input.dense_shape, sp_input.values, default_value=default_value, validate_indices=validate_indices, @@ -673,14 +706,14 @@ def sparse_to_indicator(sp_input, vocab_size, name=None): """Converts a `SparseTensor` of ids into a dense bool indicator tensor. The last dimension of `sp_input.indices` is discarded and replaced with - the values of `sp_input`. If `sp_input.shape = [D0, D1, ..., Dn, K]`, then - `output.shape = [D0, D1, ..., Dn, vocab_size]`, where + the values of `sp_input`. If `sp_input.dense_shape = [D0, D1, ..., Dn, K]`, + then `output.shape = [D0, D1, ..., Dn, vocab_size]`, where output[d_0, d_1, ..., d_n, sp_input[d_0, d_1, ..., d_n, k]] = True and False elsewhere in `output`. - For example, if `sp_input.shape = [2, 3, 4]` with non-empty values: + For example, if `sp_input.dense_shape = [2, 3, 4]` with non-empty values: [0, 0, 0]: 0 [0, 1, 0]: 10 @@ -721,7 +754,7 @@ def sparse_to_indicator(sp_input, vocab_size, name=None): num_entries = array_ops.shape(sp_input.indices)[0] new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True) sp_values = sparse_tensor.SparseTensor( - sp_input.indices, new_values, sp_input.shape) + sp_input.indices, new_values, sp_input.dense_shape) sp_new = sparse_merge(sp_input, sp_values, vocab_size, name) @@ -746,7 +779,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None, - `indices` is equivalent to `sp_ids.indices` with the last dimension discarded and replaced with `sp_ids.values`. - `values` is simply `sp_values.values`. - - If `sp_ids.shape = [D0, D1, ..., Dn, K]`, then + - If `sp_ids.dense_shape = [D0, D1, ..., Dn, K]`, then `output.shape = [D0, D1, ..., Dn, vocab_size]`. 
For example, consider the following feature vectors: @@ -787,7 +820,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None, ```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], - shape=[3, 6]) + dense_shape=[3, 6]) ``` Args: @@ -828,7 +861,8 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None, new_values = sp_values.values new_shape = array_ops.concat( 0, - [array_ops.slice(sp_ids.shape, [0], array_ops.expand_dims(rank - 1, 0)), + [array_ops.slice( + sp_ids.dense_shape, [0], array_ops.expand_dims(rank - 1, 0)), math_ops.cast(array_ops.pack([vocab_size]), dtypes.int64)]) result = sparse_tensor.SparseTensor(new_indices, new_values, new_shape) @@ -875,7 +909,7 @@ def sparse_retain(sp_input, to_retain): new_indices = array_ops.gather(sp_input.indices, where_true) new_values = array_ops.gather(sp_input.values, where_true) return sparse_tensor.SparseTensor(new_indices, new_values, - array_ops.identity(sp_input.shape)) + array_ops.identity(sp_input.dense_shape)) def sparse_reset_shape(sp_input, new_shape=None): @@ -936,7 +970,7 @@ def sparse_reset_shape(sp_input, new_shape=None): in_indices = array_ops.identity(sp_input.indices) in_values = array_ops.identity(sp_input.values) - in_shape = array_ops.identity(sp_input.shape) + in_shape = array_ops.identity(sp_input.dense_shape) if new_shape is None: dim_low_bound = math_ops.reduce_max(in_indices, 0) @@ -1015,12 +1049,13 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): default_value = ops.convert_to_tensor( default_value, dtype=sp_input.values.dtype) - num_rows = math_ops.cast(sp_input.shape[0], dtypes.int32) + num_rows = math_ops.cast(sp_input.dense_shape[0], dtypes.int32) all_row_indices = math_ops.cast(math_ops.range(num_rows), dtypes.int64) empty_row_indices, _ = array_ops.setdiff1d(all_row_indices, sp_input.indices[:, 0]) empty_row_indicator = sparse_to_dense( - empty_row_indices, array_ops.expand_dims(sp_input.shape[0], -1), True, + empty_row_indices, + array_ops.expand_dims(sp_input.dense_shape[0], -1), True, False) empty_row_indices_as_column = array_ops.reshape(empty_row_indices, [-1, 1]) @@ -1036,7 +1071,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): additional_values]) sp_unordered_output = sparse_tensor.SparseTensor( all_indices_unordered, - all_values_unordered, sp_input.shape) + all_values_unordered, sp_input.dense_shape) sp_ordered_output = sparse_reorder(sp_unordered_output) return sp_ordered_output, empty_row_indicator @@ -1059,7 +1094,7 @@ def serialize_sparse(sp_input, name=None): sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._serialize_sparse( - sp_input.indices, sp_input.values, sp_input.shape, name=name) + sp_input.indices, sp_input.values, sp_input.dense_shape, name=name) def serialize_many_sparse(sp_input, name=None): @@ -1088,7 +1123,7 @@ def serialize_many_sparse(sp_input, name=None): sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._serialize_many_sparse( - sp_input.indices, sp_input.values, sp_input.shape, name=name) + sp_input.indices, sp_input.values, sp_input.dense_shape, name=name) def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None): @@ -1191,8 +1226,8 @@ def sparse_tensor_dense_matmul(sp_a, `sp_a=True`. This operation tends to perform well when A is more sparse, if the column size - of the product is small (e.g. matrix-vector multiplication), if sp_a.shape - takes on large values. + of the product is small (e.g. 
matrix-vector multiplication), if + `sp_a.dense_shape` takes on large values. Below is a rough speed comparison between sparse_tensor_dense_matmul, labelled 'sparse', and matmul(sp_a=True), labelled 'dense'. For purposes of @@ -1333,7 +1368,7 @@ def sparse_tensor_dense_matmul(sp_a, return gen_sparse_ops._sparse_tensor_dense_mat_mul( a_indices=sp_a.indices, a_values=sp_a.values, - a_shape=sp_a.shape, + a_shape=sp_a.dense_shape, b=b, adjoint_a=adjoint_a, adjoint_b=adjoint_b) @@ -1388,9 +1423,9 @@ def sparse_softmax(sp_input, name=None): with ops.name_scope(name, "SparseSoftmax", [sp_input.indices, sp_input.values]) as name: out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values, - sp_input.shape) + sp_input.dense_shape) return sparse_tensor.SparseTensor( - sp_input.indices, out_vals, sp_input.shape) + sp_input.indices, out_vals, sp_input.dense_shape) def sparse_maximum(sp_a, sp_b, name=None): @@ -1421,12 +1456,12 @@ def sparse_maximum(sp_a, sp_b, name=None): out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum( sp_a.indices, sp_a.values, - sp_a.shape, + sp_a.dense_shape, sp_b.indices, sp_b.values, - sp_b.shape, + sp_b.dense_shape, name=name) - return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.shape) + return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape) def sparse_minimum(sp_a, sp_b, name=None): @@ -1457,12 +1492,12 @@ def sparse_minimum(sp_a, sp_b, name=None): out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum( sp_a.indices, sp_a.values, - sp_a.shape, + sp_a.dense_shape, sp_b.indices, sp_b.values, - sp_b.shape, + sp_b.dense_shape, name=name) - return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.shape) + return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape) def sparse_transpose(sp_input, perm=None, name=None): @@ -1505,7 +1540,7 @@ def sparse_transpose(sp_input, perm=None, name=None): indices = sp_input.indices transposed_indices = array_ops.transpose( array_ops.gather(array_ops.transpose(indices), perm)) - dense_shape = sp_input.shape + dense_shape = sp_input.dense_shape transposed_dense_shape = array_ops.gather(dense_shape, perm) transposed_st = sparse_tensor.SparseTensor( transposed_indices, sp_input.values, @@ -1536,7 +1571,7 @@ def _add_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.shape, + sp_input.indices, sp_input.values, sp_input.dense_shape, container=container, shared_name=shared_name, name=name) @@ -1570,7 +1605,7 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_many_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.shape, + sp_input.indices, sp_input.values, sp_input.dense_shape, container=container, shared_name=shared_name, name=name) diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index 0fafbfaa0c3db4..cc94cf1f38e141 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops.clip_ops import * from tensorflow.python.ops.special_math_ops import * # TODO(vrv): Switch to import * once we're okay with exposing the module. 
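The sparse matmul, softmax, and elementwise max/min wrappers above now read `dense_shape` from their inputs. A short sketch of the corresponding user-facing calls, again assuming a build with this patch; shapes and values are made up:

```python
import numpy as np
import tensorflow as tf

# Hypothetical 2x3 sparse operands and a dense 3x2 matrix.
sp_a = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1.0, 2.0],
                       dense_shape=[2, 3])
sp_b = tf.SparseTensor(indices=[[0, 0], [1, 1]], values=[5.0, -1.0],
                       dense_shape=[2, 3])
b = tf.constant(np.arange(6, dtype=np.float32).reshape(3, 2))

product = tf.sparse_tensor_dense_matmul(sp_a, b)  # dense [2, 2] result
elemwise_max = tf.sparse_maximum(sp_a, sp_b)      # union of indices, max values

with tf.Session() as sess:
  print(sess.run(product))
  print(sess.run(elemwise_max))
```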
+from tensorflow.python.ops.confusion_matrix import confusion_matrix from tensorflow.python.ops.control_flow_ops import Assert from tensorflow.python.ops.control_flow_ops import group from tensorflow.python.ops.control_flow_ops import no_op @@ -53,7 +54,8 @@ from tensorflow.python.ops.init_ops import * from tensorflow.python.ops.io_ops import * from tensorflow.python.ops.linalg_ops import * -from tensorflow.python.ops.logging_ops import * +from tensorflow.python.ops.logging_ops import Print +from tensorflow.python.ops.logging_ops import get_summary_op from tensorflow.python.ops.math_ops import * from tensorflow.python.ops.numerics import * from tensorflow.python.ops.parsing_ops import * @@ -90,6 +92,7 @@ from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import check_ops as _check_ops from tensorflow.python.ops import clip_ops as _clip_ops +from tensorflow.python.ops import confusion_matrix as _confusion_matrix from tensorflow.python.ops import control_flow_ops as _control_flow_ops from tensorflow.python.ops import data_flow_ops as _data_flow_ops from tensorflow.python.ops import functional_ops as _functional_ops @@ -243,6 +246,7 @@ "parse_single_sequence_example", "serialize_many_sparse", "serialize_sparse", + "confusion_matrix", ] _allowed_symbols = (_allowed_symbols_array_ops + @@ -261,6 +265,7 @@ _array_ops, _check_ops, _clip_ops, + _confusion_matrix, _control_flow_ops, _constant_op, _data_flow_ops, diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py index 4858453b526582..daffdf79d643be 100644 --- a/tensorflow/python/ops/state_ops.py +++ b/tensorflow/python/ops/state_ops.py @@ -135,7 +135,7 @@ # pylint: enable=wildcard-import -# pylint: disable=protected-access +# pylint: disable=protected-access,g-doc-return-or-yield,g-doc-args def variable_op(shape, dtype, name="Variable", set_shape=True, container="", shared_name=""): """Create a variable Operation. @@ -146,8 +146,6 @@ def variable_op(shape, dtype, name="Variable", set_shape=True, container="", shape: The shape of the tensor managed by this variable dtype: The underlying type of the tensor values. name: optional name to use for the variable op. - set_shape: If True, set the shape property of the returned Tensor to - the shape argument. container: An optional string. Defaults to "". If non-empty, this variable is placed in the given container. Otherwise, a default container is used. @@ -169,6 +167,32 @@ def variable_op(shape, dtype, name="Variable", set_shape=True, container="", return ret +def variable_op_v2(shape, dtype, name="Variable", container="", shared_name=""): + """Create a variable Operation. + + See also variables.Variable. + + Args: + shape: The shape of the tensor managed by this variable + dtype: The underlying type of the tensor values. + name: optional name to use for the variable op. + container: An optional string. Defaults to "". + If non-empty, this variable is placed in the given container. + Otherwise, a default container is used. + shared_name: An optional string. Defaults to "". + If non-empty, this variable is named in the given bucket + with this shared_name. Otherwise, the node name is used instead. + + Returns: + A variable tensor. + """ + return gen_state_ops._variable_v2(shape=shape, + dtype=dtype, + name=name, + container=container, + shared_name=shared_name) + + def init_variable(v, init, name="init"): """Initializes variable with "init".
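`variable_op_v2` above is a thin wrapper around the new `VariableV2` kernel; ordinary code should keep using `tf.Variable`, which emits `VariableV2` nodes once this change is in (see the `freeze_graph` and `device_setter` hunks later in this patch). A hedged sketch of the low-level call, using the internal `state_ops` module as defined here:

```python
import tensorflow as tf
from tensorflow.python.ops import state_ops

# An uninitialized ref tensor backed by a "VariableV2" op.
var = state_ops.variable_op_v2([2, 3], tf.float32, name="demo_var")
print(var.op.type)  # expected to print "VariableV2"

assign = tf.assign(var, tf.zeros([2, 3]))  # must assign before reading
with tf.Session() as sess:
  sess.run(assign)
  print(sess.run(var))
```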
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 57e774235512b0..7dd3a8d87af8a1 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -56,6 +56,7 @@ # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_string_ops import * +from tensorflow.python.util import deprecation # pylint: enable=wildcard-import @@ -109,6 +110,27 @@ def string_split(source, delimiter=" "): # pylint: disable=invalid-name return sparse_tensor.SparseTensor(indices, values, shape) +def reduce_join(inputs, axis=None, + keep_dims=False, + separator="", + name=None, + reduction_indices=None): + axis = deprecation.deprecated_argument_lookup("axis", axis, + "reduction_indices", + reduction_indices) + if axis is None: + raise ValueError("axis must be specified.") + return gen_string_ops.reduce_join( + inputs=inputs, + reduction_indices=axis, + keep_dims=keep_dims, + separator=separator, + name=name) + + +reduce_join.__doc__ = deprecation.rewrite_argument_docstring( + gen_string_ops.reduce_join.__doc__, "reduction_indices", "axis") + ops.NotDifferentiable("StringToHashBucket") ops.NotDifferentiable("StringToHashBucketFast") ops.NotDifferentiable("StringToHashBucketStrong") diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py index 09955e690c3a72..e600478b42d959 100644 --- a/tensorflow/python/ops/template.py +++ b/tensorflow/python/ops/template.py @@ -44,7 +44,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None, that are intended to be locals can be created by specifying `tf.Variable(..., trainable=false)`. * The function may use variable scopes and other templates internally to - create and reuse variables, but it shouldn't use `tf.all_variables` to + create and reuse variables, but it shouldn't use `tf.global_variables` to capture variables that are defined outside of the scope of the function. * Internal scopes and variable names should not depend on any arguments that are not supplied to `make_template`. In general you will get a ValueError diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index 8d03a1e23a1f65..f97de8d7234646 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util.deprecation import deprecated def _maybe_set_device(handle_op, value_t): @@ -81,7 +82,7 @@ class TensorArray(object): def __init__(self, dtype, size=None, dynamic_size=None, clear_after_read=None, tensor_array_name=None, handle=None, - flow=None, infer_shape=True, elem_shape=None, name=None): + flow=None, infer_shape=True, element_shape=None, name=None): """Construct a new TensorArray or wrap an existing TensorArray handle. A note about the parameter `name`: @@ -109,8 +110,9 @@ def __init__(self, dtype, size=None, dynamic_size=None, `TensorArray.flow`. infer_shape: (optional, default: True) If True, shape inference is enabled. In this case, all elements must have the same shape. - elem_shape: (optional, default: None) A TensorShape object specifying - the shape of all the elements of the TensorArray. + element_shape: (optional, default: None) A `TensorShape` object specifying + the shape constraints of each of the elements of the TensorArray. + Need not be fully defined. 
name: A name for the operation (optional). Raises: @@ -127,6 +129,9 @@ def __init__(self, dtype, size=None, dynamic_size=None, if handle is not None and size is not None: raise ValueError("Cannot provide both a handle and size " "at the same time") + if handle is not None and element_shape is not None: + raise ValueError("Cannot provide both a handle and element_shape " + "at the same time") if handle is not None and dynamic_size is not None: raise ValueError("Cannot provide both a handle and dynamic_size " "at the same time") @@ -140,15 +145,15 @@ def __init__(self, dtype, size=None, dynamic_size=None, self._dtype = dtype # Record the current static shape for the array elements. The element - # shape is defined either by `elem_shape` or the shape of the tensor + # shape is defined either by `element_shape` or the shape of the tensor # of the first write. If `infer_shape` is true, all writes checks for # shape equality. - if elem_shape is None: + if element_shape is None: self._infer_shape = infer_shape - self._elem_shape = [] + self._element_shape = [] else: self._infer_shape = True - self._elem_shape = [tensor_shape.TensorShape(elem_shape)] + self._element_shape = [tensor_shape.TensorShape(element_shape)] with ops.name_scope(name, "TensorArray", [handle, size, flow]) as scope: if handle is not None: self._handle = handle @@ -156,7 +161,8 @@ def __init__(self, dtype, size=None, dynamic_size=None, if flow is not None: with ops.colocate_with(flow): self._handle = gen_data_flow_ops._tensor_array_v2( - dtype=dtype, size=size, dynamic_size=dynamic_size, + dtype=dtype, size=size, element_shape=element_shape, + dynamic_size=dynamic_size, clear_after_read=clear_after_read, tensor_array_name=tensor_array_name, name=scope) else: @@ -165,7 +171,8 @@ def __init__(self, dtype, size=None, dynamic_size=None, # will retroactively set the device value of this op. 
with ops.device(None), ops.colocate_with(None, ignore_existing=True): self._handle = gen_data_flow_ops._tensor_array_v2( - dtype=dtype, size=size, dynamic_size=dynamic_size, + dtype=dtype, size=size, element_shape=element_shape, + dynamic_size=dynamic_size, clear_after_read=clear_after_read, tensor_array_name=tensor_array_name, name=scope) if flow is not None: @@ -204,7 +211,7 @@ def grad(self, source, flow=None, name=None): flow = array_ops.identity(flow, name="gradient_flow") g = TensorArray(dtype=self._dtype, handle=g_handle, flow=flow, infer_shape=self._infer_shape) - g._elem_shape = self._elem_shape + g._element_shape = self._element_shape return g def read(self, index, name=None): @@ -221,8 +228,8 @@ def read(self, index, name=None): value = gen_data_flow_ops._tensor_array_read_v2( handle=self._handle, index=index, flow_in=self._flow, dtype=self._dtype, name=name) - if self._elem_shape: - value.set_shape(self._elem_shape[0].dims) + if self._element_shape: + value.set_shape(self._element_shape[0].dims) return value def write(self, index, value, name=None): @@ -250,33 +257,42 @@ def write(self, index, value, name=None): ta = TensorArray(dtype=self._dtype, handle=self._handle) ta._flow = flow_out ta._infer_shape = self._infer_shape - ta._elem_shape = self._elem_shape + ta._element_shape = self._element_shape if ta._infer_shape: - val_shape = flow_out.op.inputs[2].get_shape() - if ta._elem_shape: - if not val_shape == ta._elem_shape[0]: + val_shape = value.get_shape() + if ta._element_shape: + if not val_shape == ta._element_shape[0]: raise ValueError( "Inconsistent shapes: saw %s but expected %s " - "(and infer_shape=True)" % (val_shape, ta._elem_shape[0])) + "(and infer_shape=True)" % (val_shape, ta._element_shape[0])) else: - ta._elem_shape.append(val_shape) + ta._element_shape.append(val_shape) return ta - def pack(self, name=None): - """Return the values in the TensorArray as a packed `Tensor`. + def stack(self, name=None): + """Return the values in the TensorArray as a stacked `Tensor`. All of the values must have been written and their shapes must all match. + If input shapes have rank-`R`, then output shape will have rank-`(R+1)`. Args: name: A name for the operation (optional). Returns: - All the tensors in the TensorArray packed into one tensor. + All the tensors in the TensorArray stacked into one tensor. """ with ops.colocate_with(self._handle): - with ops.name_scope(name, "TensorArrayPack", [self._handle]): + with ops.name_scope(name, "TensorArrayStack", [self._handle]): return self.gather(math_ops.range(0, self.size()), name=name) + @deprecated( + "2016-12-12", + "This op will be removed after the deprecation date. " + "Please switch to tf.stack.") + def pack(self, name=None): + return self.stack(name) + pack.__doc__ = stack.__doc__ + def gather(self, indices, name=None): """Return selected values in the TensorArray as a packed `Tensor`. @@ -292,8 +308,8 @@ def gather(self, indices, name=None): The in the `TensorArray` selected by `indices`, packed into one tensor. 
""" with ops.colocate_with(self._handle): - if self._elem_shape: - element_shape = self._elem_shape[0] + if self._element_shape: + element_shape = self._element_shape[0] else: element_shape = tensor_shape.TensorShape(None) value = gen_data_flow_ops._tensor_array_gather_v2( @@ -303,8 +319,8 @@ def gather(self, indices, name=None): dtype=self._dtype, name=name, element_shape=element_shape) - if self._elem_shape and self._elem_shape[0].dims is not None: - value.set_shape([None] + self._elem_shape[0].dims) + if self._element_shape and self._element_shape[0].dims is not None: + value.set_shape([None] + self._element_shape[0].dims) return value def concat(self, name=None): @@ -319,9 +335,9 @@ def concat(self, name=None): Returns: All the tensors in the TensorArray concatenated into one tensor. """ - if self._elem_shape and self._elem_shape[0].dims is not None: - element_shape_except0 = tensor_shape.TensorShape(self._elem_shape[0].dims[ - 1:]) + if self._element_shape and self._element_shape[0].dims is not None: + element_shape_except0 = ( + tensor_shape.TensorShape(self._element_shape[0].dims[1:])) else: element_shape_except0 = tensor_shape.TensorShape(None) with ops.colocate_with(self._handle): @@ -331,29 +347,39 @@ def concat(self, name=None): dtype=self._dtype, name=name, element_shape_except0=element_shape_except0) - if self._elem_shape and self._elem_shape[0].dims is not None: - value.set_shape([None] + self._elem_shape[0].dims[1:]) + if self._element_shape and self._element_shape[0].dims is not None: + value.set_shape([None] + self._element_shape[0].dims[1:]) return value - def unpack(self, value, name=None): - """Pack the values of a `Tensor` in the TensorArray. + def unstack(self, value, name=None): + """Unstack the values of a `Tensor` in the TensorArray. + If input value shapes have rank-`R`, then the output TensorArray will + contain elements whose shapes are rank-`(R-1)`. Args: - value: (N+1)-D. Tensor of type `dtype`. The Tensor to unpack. + value: (N+1)-D. Tensor of type `dtype`. The Tensor to unstack. name: A name for the operation (optional). Returns: - A new TensorArray object with flow that ensures the unpack occurs. + A new TensorArray object with flow that ensures the unstack occurs. Use this object all for subsequent operations. Raises: ValueError: if the shape inference fails. """ - with ops.name_scope(name, "TensorArrayPack", [self._handle, value]): + with ops.name_scope(name, "TensorArrayUnstack", [self._handle, value]): num_elements = array_ops.shape(value)[0] return self.scatter( indices=math_ops.range(0, num_elements), value=value, name=name) + @deprecated( + "2016-12-12", + "This op will be removed after the deprecation date. " + "Please switch to tf.unstack.") + def unpack(self, value, name=None): + return self.unstack(value, name) + unpack.__doc__ = unstack.__doc__ + def scatter(self, indices, value, name=None): """Scatter the values of a `Tensor` in specific indices of a `TensorArray`. 
@@ -381,19 +407,20 @@ def scatter(self, indices, value, name=None): ta = TensorArray(dtype=self._dtype, handle=self._handle) ta._flow = flow_out ta._infer_shape = self._infer_shape - ta._elem_shape = self._elem_shape + ta._element_shape = self._element_shape if ta._infer_shape: val_shape = flow_out.op.inputs[2].get_shape() - elem_shape = tensor_shape.unknown_shape() + element_shape = tensor_shape.unknown_shape() if val_shape.dims is not None: - elem_shape = tensor_shape.TensorShape(val_shape.dims[1:]) - if ta._elem_shape: - if not elem_shape == ta._elem_shape[0]: + element_shape = tensor_shape.TensorShape(val_shape.dims[1:]) + if ta._element_shape: + if not element_shape == ta._element_shape[0]: raise ValueError( "Inconsistent shapes: saw %s but expected %s " - "(and infer_shape=True)" % (elem_shape, ta._elem_shape[0])) + "(and infer_shape=True)" + % (element_shape, ta._element_shape[0])) else: - ta._elem_shape.append(elem_shape) + ta._element_shape.append(element_shape) return ta def split(self, value, lengths, name=None): @@ -424,22 +451,23 @@ def split(self, value, lengths, name=None): ta = TensorArray(dtype=self._dtype, handle=self._handle) ta._flow = flow_out ta._infer_shape = self._infer_shape - ta._elem_shape = self._elem_shape + ta._element_shape = self._element_shape if ta._infer_shape: val_shape = flow_out.op.inputs[1].get_shape() clengths = tensor_util.constant_value(flow_out.op.inputs[2]) - elem_shape = tensor_shape.unknown_shape() + element_shape = tensor_shape.unknown_shape() if val_shape.dims is not None: if clengths is not None and clengths.max() == clengths.min(): - elem_shape = tensor_shape.TensorShape( + element_shape = tensor_shape.TensorShape( [clengths[0]] + val_shape.dims[1:]) - if ta._elem_shape: - if not elem_shape == ta._elem_shape[0]: + if ta._element_shape: + if not element_shape == ta._element_shape[0]: raise ValueError( "Inconsistent shapes: saw %s but expected %s " - "(and infer_shape=True)" % (elem_shape, ta._elem_shape[0])) + "(and infer_shape=True)" + % (element_shape, ta._element_shape[0])) else: - ta._elem_shape.append(elem_shape) + ta._element_shape.append(element_shape) return ta def size(self, name=None): diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 9fceeb2c355f4b..ae9208c7ffd82a 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -283,8 +283,9 @@ def _true_getter(name, shape=None, dtype=dtypes.float32, # pylint: disable=miss initializer=None, regularizer=None, reuse=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True): + is_scalar = shape is not None and not shape # Partitioned variable case - if partitioner is not None: + if partitioner is not None and not is_scalar: if not callable(partitioner): raise ValueError( "Partitioner must be callable, but received: %s" % partitioner) diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 9f03ae6264d9f1..8460718f18a584 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -82,7 +82,7 @@ class Variable(object): ``` The most common initialization pattern is to use the convenience function - `global_variable_initializers()` to add an Op to the graph that initializes + `global_variables_initializer()` to add an Op to the graph that initializes all the variables. You then run that Op after launching the graph. 
```python @@ -492,7 +492,7 @@ def eval(self, session=None): ```python v = tf.Variable([1, 2]) - init = tf.global_variable_initializers() + init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) @@ -1251,7 +1251,7 @@ def assert_variables_initialized(var_list=None): if not var_list: var_list = [] for op in ops.get_default_graph().get_operations(): - if op.type in ["Variable", "AutoReloadVariable"]: + if op.type in ["Variable", "VariableV2", "AutoReloadVariable"]: var_list.append(op.outputs[0]) if not var_list: return None diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py index 1e050055cd8ee6..53072b684f931e 100644 --- a/tensorflow/python/saved_model/builder.py +++ b/tensorflow/python/saved_model/builder.py @@ -88,7 +88,7 @@ def __init__(self, export_dir): if file_io.file_exists(export_dir): raise AssertionError( "Export directory already exists. Please specify a different export " - "directory.") + "directory: %s" % export_dir) file_io.recursive_create_dir(self._export_dir) diff --git a/tensorflow/python/saved_model/main_op.py b/tensorflow/python/saved_model/main_op.py index 362cde858e0369..13892b13a58ac2 100644 --- a/tensorflow/python/saved_model/main_op.py +++ b/tensorflow/python/saved_model/main_op.py @@ -42,7 +42,6 @@ def main_op(): init_tables = tf_data_flow_ops.initialize_all_tables() return tf.group(init, init_local, init_tables) - def main_op_with_restore(restore_op_name): """Returns a main op to init variables, tables and restore the graph. @@ -56,7 +55,6 @@ def main_op_with_restore(restore_op_name): Returns: The set of ops to be run as part of the main op upon the load operation. """ - simple_main_op = main_op() - with ops.control_dependency([simple_main_op]): - restore = restore_op_name - return tf.group(restore) + with ops.control_dependencies([main_op()]): + main_op_with_restore = tf.group(restore_op_name) + return main_op_with_restore diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index bf5b186b80d21d..cbad1ddbe7a1a7 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -27,6 +27,7 @@ from tensorflow.python.saved_model import builder as saved_model_builder from tensorflow.python.saved_model import constants from tensorflow.python.saved_model import loader +from tensorflow.python.saved_model import main_op from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.saved_model import tag_constants from tensorflow.python.util import compat @@ -397,6 +398,41 @@ def testAssets(self): compat.as_bytes("ignored.txt")) self.assertFalse(file_io.file_exists(ignored_asset_path)) + def testCustomMainOp(self): + export_dir = os.path.join(tf.test.get_temp_dir(), "test_main_op") + builder = saved_model_builder.SavedModelBuilder(export_dir) + + with self.test_session(graph=tf.Graph()) as sess: + # Add `v1` and `v2` variables to the graph. + v1 = tf.Variable(1, name="v1") + tf.add_to_collection("v", v1) + v2 = tf.Variable(2, name="v2") + tf.add_to_collection("v", v2) + + # Initialize another variable `v3` to 42. + v3 = tf.Variable(42, name="v3") + tf.add_to_collection("v", v3) + + # Set up an assignment op to be run as part of the main_op. 
+ with tf.control_dependencies([main_op.main_op()]): + add_v1_v2 = tf.add(v1._ref(), v2._ref()) + custom_main_op = tf.group(tf.assign(v3, add_v1_v2)) + + sess.run(custom_main_op) + builder.add_meta_graph_and_variables( + sess, ["foo"], main_op=custom_main_op) + + # Save the SavedModel to disk. + builder.save() + + with self.test_session(graph=tf.Graph()) as sess: + loader.load(sess, ["foo"], export_dir) + self.assertEqual(1, tf.get_collection("v")[0].eval()) + self.assertEqual(2, tf.get_collection("v")[1].eval()) + # Evaluates to the sum of the first two variables and assigned as part of + # the main_op, following a restore. + self.assertEqual(3, tf.get_collection("v")[2].eval()) + def testLegacyInitOp(self): export_dir = os.path.join(tf.test.get_temp_dir(), "test_legacy_init_op") builder = saved_model_builder.SavedModelBuilder(export_dir) diff --git a/tensorflow/python/summary/event_accumulator_test.py b/tensorflow/python/summary/event_accumulator_test.py index 6d659e27e33455..65797138494322 100644 --- a/tensorflow/python/summary/event_accumulator_test.py +++ b/tensorflow/python/summary/event_accumulator_test.py @@ -645,7 +645,7 @@ def testTFSummaryScalar(self): ipt = tf.placeholder(tf.float32) tf.summary.scalar('scalar1', ipt) tf.summary.scalar('scalar2', ipt * ipt) - merged = tf.contrib.deprecated.merge_all_summaries() + merged = tf.summary.merge_all() writer.add_graph(sess.graph) for i in xrange(10): summ = sess.run(merged, feed_dict={ipt: i}) @@ -692,7 +692,7 @@ def testTFSummaryImage(self): tf.summary.image('images', ipt, max_outputs=2) with tf.name_scope('3'): tf.summary.image('images', ipt, max_outputs=3) - merged = tf.contrib.deprecated.merge_all_summaries() + merged = tf.summary.merge_all() writer.add_graph(sess.graph) for i in xrange(10): summ = sess.run(merged) diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py index a59e39119811c8..f2042edc1f7865 100644 --- a/tensorflow/python/tools/freeze_graph_test.py +++ b/tensorflow/python/tools/freeze_graph_test.py @@ -76,6 +76,7 @@ def _testFreezeGraph(self, saver_write_version): self.assertEqual(4, len(output_graph_def.node)) for node in output_graph_def.node: + self.assertNotEqual("VariableV2", node.op) self.assertNotEqual("Variable", node.op) with tf.Session() as sess: diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index 542396003c1af9..150eb8d655391c 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -217,6 +217,68 @@ def after_run(self, run_context, run_values): run_context.request_stop() +class CheckpointSaverListener(object): + """An interface for event hooks that depend on a checkpoint. + + CheckpointSaverListeners are similar to SessionRunHooks, and can be useful to + track training, report progress, and more. The distinction is that + CheckpointSaverListeners run only in steps when CheckpointSaverHook is + triggered, and provide callbacks to run before or after the checkpoint is + generated. This is in contrast to SessionRunHooks, which may run in steps + when no checkpoint is written, and which have no guaranteed execution order + in any case. 
CheckpointSaverListeners use the observer pattern and notify at + the following points: + - when a session starts being used + - before each call to `Saver.save()` + - after each call to `Saver.save()` + - when the session closed + + Custom CheckpointSaverListeners look like this: + class ExampleCheckpointSaverListerner(CheckpointSaverListener): + def begin(self): + # You can add ops to the graph here. + print('Starting the session.') + self.your_tensor = ... + + def before_save(self, session, global_step_value): + print('About to write a checkpoint') + + def after_save(self, session, global_step_value): + print('Done writing checkpoint.') + + def end(self, session, global_step_value): + print('Done with the session.') + + A CheckpointSaverListener may simply take some action after every checkpoint. + It is also possible for the listener to use its own schedule to act less + frequently, based on wall clock time or on global_step_value. In this case, + implementors must be careful about what happens at end(). When end is called, + The CheckpointSaverHook will have already triggered after_save() in the same + global_step, but the listener may or may not have actually acted on it. + The listener may want to be sure to act at end() if there is a fresh + checkpoint available, but should not act twice if after_save() already handled + it. In this case, end() should have logic to detect the situation and do the + right thing, similar to what CheckpointSaverHook.end() does using + self._timer.last_triggered_step(). + + To use such listeners, pass them in the checkpoint_listeners argument to + graph_actions._monitored_train(). If using tf.Learn Estimators, create a + custom Estimator and override _get_checkpoint_listeners(). + """ + + def begin(self): + pass + + def before_save(self, session, global_step_value): + pass + + def after_save(self, session, global_step_value): + pass + + def end(self, session, global_step_value): + pass + + class CheckpointSaverHook(session_run_hook.SessionRunHook): """Saves checkpoints every N steps or seconds.""" @@ -226,7 +288,8 @@ def __init__(self, save_steps=None, saver=None, checkpoint_basename="model.ckpt", - scaffold=None): + scaffold=None, + listeners=None): """Initialize CheckpointSaverHook monitor. Args: @@ -236,6 +299,10 @@ def __init__(self, saver: `Saver` object, used for saving. checkpoint_basename: `str`, base name for the checkpoint files. scaffold: `Scaffold`, use to get saver object. + listeners: List of `CheckpointSaverListener` subclass instances. + Used for callbacks that run immediately after the corresponding + CheckpointSaverHook callbacks, only in steps where the + CheckpointSaverHook was triggered. Raises: ValueError: One of `save_steps` or `save_secs` should be set. 
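A hedged sketch of wiring the new `CheckpointSaverListener` into `CheckpointSaverHook` through the `listeners` argument added above. The directory, step counts, and listener body are illustrative, and `tf.train.MonitoredTrainingSession` plus `tf.contrib.framework.get_or_create_global_step` are assumed to be available in the build:

```python
import tensorflow as tf
from tensorflow.python.training import basic_session_run_hooks


class LoggingListener(basic_session_run_hooks.CheckpointSaverListener):
  """Prints a line around every checkpoint written by CheckpointSaverHook."""

  def before_save(self, session, global_step_value):
    print("about to save a checkpoint at step %d" % global_step_value)

  def after_save(self, session, global_step_value):
    print("checkpoint written at step %d" % global_step_value)


global_step = tf.contrib.framework.get_or_create_global_step()
train_op = tf.assign_add(global_step, 1)

hook = tf.train.CheckpointSaverHook(
    checkpoint_dir="/tmp/ckpt_listener_demo",  # hypothetical path
    save_steps=10,
    saver=tf.train.Saver(),
    listeners=[LoggingListener()])

with tf.train.MonitoredTrainingSession(hooks=[hook]) as sess:
  for _ in range(25):
    sess.run(train_op)
```

Because the listener callbacks fire only when the hook actually saves, a listener that keeps its own schedule should still check for a fresh checkpoint in `end()`, as the docstring above notes.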
@@ -252,12 +319,15 @@ def __init__(self, self._scaffold = scaffold self._timer = _SecondOrStepTimer(every_secs=save_secs, every_steps=save_steps) + self._listeners = listeners or [] def begin(self): self._global_step_tensor = training_util.get_global_step() if self._global_step_tensor is None: raise RuntimeError( "Global step should be created to use CheckpointSaverHook.") + for l in self._listeners: + l.begin() def before_run(self, run_context): # pylint: disable=unused-argument if self._timer.last_triggered_step() is None: @@ -288,16 +358,25 @@ def end(self, session): last_step = session.run(training_util.get_global_step()) if last_step != self._timer.last_triggered_step(): self._save(last_step, session) + for l in self._listeners: + l.end(session, last_step) def _save(self, step, session): """Saves the latest checkpoint.""" logging.info("Saving checkpoints for %d into %s.", step, self._save_path) + + for l in self._listeners: + l.before_save(session, step) + self._get_saver().save(session, self._save_path, global_step=step) self._summary_writer.add_session_log( SessionLog( status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path), step) + for l in self._listeners: + l.after_save(session, step) + def _get_saver(self): if self._saver is not None: return self._saver diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py index 1b8ebd11f3f078..2847b1e05c6370 100644 --- a/tensorflow/python/training/basic_session_run_hooks_test.py +++ b/tensorflow/python/training/basic_session_run_hooks_test.py @@ -32,6 +32,34 @@ from tensorflow.python.training import monitored_session +class MockCheckpointSaverListener( + basic_session_run_hooks.CheckpointSaverListener): + + def __init__(self): + self.begin_count = 0 + self.before_save_count = 0 + self.after_save_count = 0 + self.end_count = 0 + + def begin(self): + self.begin_count += 1 + + def before_save(self, session, global_step): + self.before_save_count += 1 + + def after_save(self, session, global_step): + self.after_save_count += 1 + + def end(self, session, global_step): + self.end_count += 1 + + def get_counts(self): + return {'begin': self.begin_count, + 'before_save': self.before_save_count, + 'after_save': self.after_save_count, + 'end': self.end_count} + + class SecondOrStepTimerTest(tf.test.TestCase): def test_raise_in_both_secs_and_steps(self): @@ -247,6 +275,26 @@ def test_save_secs_saves_in_first_step(self): self.assertEqual(1, tf.contrib.framework.load_variable( self.model_dir, self.global_step.name)) + def test_save_secs_calls_listeners_at_begin_and_end(self): + with self.graph.as_default(): + listener = MockCheckpointSaverListener() + hook = tf.train.CheckpointSaverHook( + self.model_dir, save_secs=2, scaffold=self.scaffold, + listeners=[listener]) + hook.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(self.train_op) # hook runs here + mon_sess.run(self.train_op) # hook won't run here, so it does at end + hook.end(sess) # hook runs here + self.assertEqual({'begin': 1, + 'before_save': 2, + 'after_save': 2, + 'end': 1}, + listener.get_counts()) + def test_save_secs_saves_periodically(self): with self.graph.as_default(): hook = tf.train.CheckpointSaverHook( @@ -277,6 +325,33 @@ def test_save_secs_saves_periodically(self): self.assertEqual(6, tf.contrib.framework.load_variable( self.model_dir, self.global_step.name)) + def 
test_save_secs_calls_listeners_periodically(self): + with self.graph.as_default(): + listener = MockCheckpointSaverListener() + hook = tf.train.CheckpointSaverHook( + self.model_dir, save_secs=2, scaffold=self.scaffold, + listeners=[listener]) + hook.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(self.train_op) # hook runs here + mon_sess.run(self.train_op) + time.sleep(2.5) + mon_sess.run(self.train_op) # hook runs here + mon_sess.run(self.train_op) + mon_sess.run(self.train_op) + time.sleep(2.5) + mon_sess.run(self.train_op) # hook runs here + mon_sess.run(self.train_op) # hook won't run here, so it does at end + hook.end(sess) # hook runs here + self.assertEqual({'begin': 1, + 'before_save': 4, + 'after_save': 4, + 'end': 1}, + listener.get_counts()) + def test_save_steps_saves_in_first_step(self): with self.graph.as_default(): hook = tf.train.CheckpointSaverHook( @@ -334,8 +409,8 @@ def test_save_saves_at_end(self): def test_summary_writer_defs(self): testing.FakeSummaryWriter.install() - tf.train.SummaryWriterCache.clear() - summary_writer = tf.train.SummaryWriterCache.get(self.model_dir) + tf.summary.FileWriterCache.clear() + summary_writer = tf.summary.FileWriterCache.get(self.model_dir) with self.graph.as_default(): hook = tf.train.CheckpointSaverHook( diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py index d4c10e18839a93..7f403f49275692 100644 --- a/tensorflow/python/training/device_setter.py +++ b/tensorflow/python/training/device_setter.py @@ -198,7 +198,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps", if ps_ops is None: # TODO(sherrym): Variables in the LOCAL_VARIABLES collection should not be # placed in the parameter server. - ps_ops = ["Variable"] + ps_ops = ["Variable", "VariableV2"] if not merge_devices: logging.warning( diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 4fdda9686000ae..7d043bf593d90d 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -407,7 +407,8 @@ def _as_original_type(original_tensors, tensor_list): return tensor_list -def _store_sparse_tensors(tensor_list, enqueue_many, shared_map_ops=None): +def _store_sparse_tensors(tensor_list, enqueue_many, keep_input, + shared_map_ops=None): """Store SparseTensors for feeding into batch, etc. If `shared_map_ops` is provided, the underlying `SparseTensorsMap` objects @@ -425,6 +426,8 @@ def _store_sparse_tensors(tensor_list, enqueue_many, shared_map_ops=None): Args: tensor_list: List of `Tensor` and `SparseTensor` objects. enqueue_many: Python `Boolean`. + keep_input: Must be a scalar bool Tensor (not a Python bool). If False, + don't store. shared_map_ops: (optional) List of `Operation` objects from a previous call to `_store_sparse_tensors`. If not `None`, the op types should be one of `AddSparseToTensorsMap` or `AddManySparseToTensorsMap` in the @@ -440,43 +443,67 @@ def _store_sparse_tensors(tensor_list, enqueue_many, shared_map_ops=None): def _sparse_meta_data(t, storing_op, map_op): if not isinstance(t, sparse_tensor.SparseTensor): return _SparseMetaData(False, None, None) - rank = t.shape.get_shape().with_rank(1)[0] + rank = t.dense_shape.get_shape().with_rank(1)[0] if enqueue_many: rank -= 1 - # If a shared map_op was provided, use that. Otherwise use the name of + # If a shared map_op was provided, use that. 
Otherwise use the name of # the operation used to store the SparseTensor. return _SparseMetaData( sparse=True, map_op=map_op or storing_op, rank=rank) def _maybe_store(t, shared_map_op): + """Store Sparse tensor, if necessary.""" if not isinstance(t, sparse_tensor.SparseTensor): return t map_op_name = shared_map_op.name if shared_map_op else None - return (_store_many_sparse(t, shared_name=map_op_name) if enqueue_many - else _store_sparse(t, shared_name=map_op_name)) + def _maybe_store_sparse(t, map_op_name, keep_input): + return control_flow_ops.cond( + keep_input, + lambda: _store_sparse(t, shared_name=map_op_name), + lambda: constant_op.constant(-1, dtypes.int64)) + def _maybe_store_many_sparse(t, map_op_name, keep_input): + out_tensor = control_flow_ops.cond( + keep_input, + lambda: _store_many_sparse(t, shared_name=map_op_name), + lambda: -1 * array_ops.ones(array_ops.shape(t)[0:1], dtypes.int64)) + out_tensor.set_shape([None]) # necessary when t.ndims is unknown + return out_tensor + store_f = _maybe_store_many_sparse if enqueue_many else _maybe_store_sparse + return store_f(t, map_op_name, keep_input) stored_list = [ _maybe_store(t, shared_map_op) for t, shared_map_op in zip(tensor_list, maybe_shared_map_ops)] + # Since the output of `_store{_many}_sparse is wrapped in a tf.cond `Merge`, + # we can't just get the Op of the resulting tensor. + def _sparse_op(stored): + for input_tensor in stored.op.inputs: + if input_tensor.op.type in ("AddSparseToTensorsMap", + "AddManySparseToTensorsMap"): + return input_tensor.op + # If there was no sparse input, then the original stored Tensor wasn't + # sparse and we can just return the original Tensor's Op. + return stored.op sparse_info_list = [ - _sparse_meta_data(t, stored.op, shared_map_op) + _sparse_meta_data(t, _sparse_op(stored), shared_map_op) for t, stored, shared_map_op in zip(tensor_list, stored_list, maybe_shared_map_ops)] - # expand dims of stored tensors by 1 for proper enqueue shape + # Expand dims of stored tensors by 1 for proper enqueue shape stored_list = [ array_ops.expand_dims(s, [-1]) if s_info.sparse else s for s, s_info in zip(stored_list, sparse_info_list)] return stored_list, sparse_info_list -def _store_sparse_tensors_join(tensor_list_list, enqueue_many): +def _store_sparse_tensors_join(tensor_list_list, enqueue_many, keep_input): """Store SparseTensors for feeding into batch_join, etc.""" (s0, sparse_info_list) = _store_sparse_tensors( - tensor_list_list[0], enqueue_many) + tensor_list_list[0], enqueue_many, keep_input) stored_list_list = [s0] for tensor_list in tensor_list_list[1:]: s, sparse_info_candidate = _store_sparse_tensors( - tensor_list, enqueue_many, [st.map_op for st in sparse_info_list]) + tensor_list, enqueue_many, keep_input, + [st.map_op for st in sparse_info_list]) if sparse_info_list != sparse_info_candidate: raise ValueError("Inconsistent SparseTensors list: %s vs. 
%s" % (tensor_list_list[0], tensor_list)) @@ -498,8 +525,7 @@ def _restore_sparse_tensors(stored_list, sparse_info_list): sparse_handles=array_ops.squeeze(s, [1]), rank=(info.rank + 1).value) if info.sparse else s - for (s, info) - in zip(stored_list, sparse_info_list)] + for (s, info) in zip(stored_list, sparse_info_list)] return tensors if received_sequence else tensors[0] @@ -518,6 +544,12 @@ def _validate_join(tensor_list_list): return tensor_list_list +def _validate_tensor_or_none(tensor_or_none): + if tensor_or_none is not None: + return ops.convert_to_tensor(tensor_or_none) + return tensor_or_none + + def _dtypes(tensor_list_list): all_types = [[t.dtype for t in tl] for tl in tensor_list_list] types = all_types[0] @@ -571,19 +603,35 @@ def _shapes(tensor_list_list, shapes, enqueue_many): return shapes -def _enqueue_join(queue, tensor_list_list, enqueue_many): +def _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input): + """Enqueue `tensor_list_list` in `queue`.""" if enqueue_many: - enqueue_ops = [queue.enqueue_many(tl) for tl in tensor_list_list] + enqueue_fn = queue.enqueue_many + else: + enqueue_fn = queue.enqueue + if keep_input is None: + enqueue_ops = [enqueue_fn(tl) for tl in tensor_list_list] else: - enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list] + enqueue_ops = [control_flow_ops.cond( + keep_input, + lambda: enqueue_fn(tl), + control_flow_ops.no_op) for tl in tensor_list_list] queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops)) -def _enqueue(queue, tensor_list, threads, enqueue_many): +def _enqueue(queue, tensor_list, threads, enqueue_many, keep_input): + """Enqueue `tensor_list` in `queue`.""" if enqueue_many: - enqueue_ops = [queue.enqueue_many(tensor_list)] * threads + enqueue_fn = queue.enqueue_many else: - enqueue_ops = [queue.enqueue(tensor_list)] * threads + enqueue_fn = queue.enqueue + if keep_input is None: + enqueue_ops = [enqueue_fn(tensor_list)] * threads + else: + enqueue_ops = [control_flow_ops.cond( + keep_input, + lambda: enqueue_fn(tensor_list), + control_flow_ops.no_op)] * threads queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops)) @@ -592,6 +640,144 @@ def _which_queue(dynamic_pad): else data_flow_ops.FIFOQueue) +def _batch(tensors, batch_size, keep_input, num_threads=1, capacity=32, + enqueue_many=False, shapes=None, dynamic_pad=False, + allow_smaller_final_batch=False, shared_name=None, + name=None): + """Helper function for `batch` and `maybe_batch`.""" + tensor_list = _as_tensor_list(tensors) + with ops.name_scope(name, "batch", list(tensor_list) + [keep_input]) as name: + tensor_list = _validate(tensor_list) + keep_input = _validate_tensor_or_none(keep_input) + (tensor_list, sparse_info) = _store_sparse_tensors( + tensor_list, enqueue_many, keep_input) + types = _dtypes([tensor_list]) + shapes = _shapes([tensor_list], shapes, enqueue_many) + # TODO(josh11b,mrry): Switch to BatchQueue once it is written. + queue = _which_queue(dynamic_pad)( + capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) + _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input) + summary.scalar("queue/%s/fraction_of_%d_full" % (queue.name, capacity), + math_ops.cast(queue.size(), dtypes.float32) * + (1. 
/ capacity)) + + if allow_smaller_final_batch: + dequeued = queue.dequeue_up_to(batch_size, name=name) + else: + dequeued = queue.dequeue_many(batch_size, name=name) + dequeued = _restore_sparse_tensors(dequeued, sparse_info) + return _as_original_type(tensors, dequeued) + + +# TODO(josh11b): Add a thread_multiplier or num_threads (that has to be +# a multiple of len(tensor_list_list)?) parameter, to address the use +# case where you want more parallelism than you can support different +# readers (either because you don't have that many files or can't +# read that many files in parallel due to the number of seeks required). +# Once this is done, batch() can be written as a call to batch_join(). +def _batch_join(tensors_list, batch_size, keep_input, capacity=32, + enqueue_many=False, shapes=None, dynamic_pad=False, + allow_smaller_final_batch=False, shared_name=None, name=None): + """Helper function for `batch_join` and `maybe_batch_join`.""" + tensor_list_list = _as_tensor_list_list(tensors_list) + with ops.name_scope(name, "batch_join", + _flatten(tensor_list_list) + [keep_input]) as name: + tensor_list_list = _validate_join(tensor_list_list) + keep_input = _validate_tensor_or_none(keep_input) + tensor_list_list, sparse_info = _store_sparse_tensors_join( + tensor_list_list, enqueue_many, keep_input) + types = _dtypes(tensor_list_list) + shapes = _shapes(tensor_list_list, shapes, enqueue_many) + # TODO(josh11b,mrry): Switch to BatchQueue once it is written. + queue = _which_queue(dynamic_pad)( + capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) + _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input) + summary.scalar("queue/%s/fraction_of_%d_full" % (queue.name, capacity), + math_ops.cast(queue.size(), dtypes.float32) * + (1. / capacity)) + + if allow_smaller_final_batch: + dequeued = queue.dequeue_up_to(batch_size, name=name) + else: + dequeued = queue.dequeue_many(batch_size, name=name) + dequeued = _restore_sparse_tensors(dequeued, sparse_info) + # tensors_list was validated to not be empty. + return _as_original_type(tensors_list[0], dequeued) + + +def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, + keep_input, num_threads=1, seed=None, enqueue_many=False, + shapes=None, allow_smaller_final_batch=False, + shared_name=None, name=None): + """Helper function for `shuffle_batch` and `maybe_shuffle_batch`.""" + tensor_list = _as_tensor_list(tensors) + with ops.name_scope(name, "shuffle_batch", + list(tensor_list) + [keep_input]) as name: + tensor_list = _validate(tensor_list) + keep_input = _validate_tensor_or_none(keep_input) + tensor_list, sparse_info = _store_sparse_tensors( + tensor_list, enqueue_many, keep_input) + types = _dtypes([tensor_list]) + shapes = _shapes([tensor_list], shapes, enqueue_many) + queue = data_flow_ops.RandomShuffleQueue( + capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, + dtypes=types, shapes=shapes, shared_name=shared_name) + _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input) + full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), + dtypes.float32) * + (1. / (capacity - min_after_dequeue))) + # Note that name contains a '/' at the end so we intentionally do not place + # a '/' after %s below. 
+ summary_name = ( + "queue/%sfraction_over_%d_of_%d_full" % + (name, min_after_dequeue, capacity - min_after_dequeue)) + summary.scalar(summary_name, full) + + if allow_smaller_final_batch: + dequeued = queue.dequeue_up_to(batch_size, name=name) + else: + dequeued = queue.dequeue_many(batch_size, name=name) + dequeued = _restore_sparse_tensors(dequeued, sparse_info) + return _as_original_type(tensors, dequeued) + + +def _shuffle_batch_join(tensors_list, batch_size, capacity, + min_after_dequeue, keep_input, seed=None, + enqueue_many=False, shapes=None, + allow_smaller_final_batch=False, shared_name=None, + name=None): + """Helper function for `shuffle_batch_join` and `maybe_shuffle_batch_join`.""" + tensor_list_list = _as_tensor_list_list(tensors_list) + with ops.name_scope(name, "shuffle_batch_join", + _flatten(tensor_list_list) + [keep_input]) as name: + tensor_list_list = _validate_join(tensor_list_list) + keep_input = _validate_tensor_or_none(keep_input) + tensor_list_list, sparse_info = _store_sparse_tensors_join( + tensor_list_list, enqueue_many, keep_input) + types = _dtypes(tensor_list_list) + shapes = _shapes(tensor_list_list, shapes, enqueue_many) + queue = data_flow_ops.RandomShuffleQueue( + capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, + dtypes=types, shapes=shapes, shared_name=shared_name) + _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input) + full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), + dtypes.float32) * + (1. / (capacity - min_after_dequeue))) + # Note that name contains a '/' at the end so we intentionally do not place + # a '/' after %s below. + summary_name = ( + "queue/%sfraction_over_%d_of_%d_full" % + (name, min_after_dequeue, capacity - min_after_dequeue)) + summary.scalar(summary_name, full) + + if allow_smaller_final_batch: + dequeued = queue.dequeue_up_to(batch_size, name=name) + else: + dequeued = queue.dequeue_many(batch_size, name=name) + dequeued = _restore_sparse_tensors(dequeued, sparse_info) + # tensors_list was validated to not be empty. + return _as_original_type(tensors_list[0], dequeued) + # Batching functions ---------------------------------------------------------- @@ -671,35 +857,69 @@ def batch(tensors, batch_size, num_threads=1, capacity=32, ValueError: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors`. """ - tensor_list = _as_tensor_list(tensors) - with ops.name_scope(name, "batch", tensor_list) as name: - tensor_list = _validate(tensor_list) - (tensor_list, sparse_info) = _store_sparse_tensors( - tensor_list, enqueue_many) - types = _dtypes([tensor_list]) - shapes = _shapes([tensor_list], shapes, enqueue_many) - # TODO(josh11b,mrry): Switch to BatchQueue once it is written. - queue = _which_queue(dynamic_pad)( - capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) - _enqueue(queue, tensor_list, num_threads, enqueue_many) - summary.scalar("queue/%s/fraction_of_%d_full" % (queue.name, capacity), - math_ops.cast(queue.size(), dtypes.float32) * - (1. 
/ capacity)) + return _batch( + tensors, + batch_size, + keep_input=True, + num_threads=num_threads, + capacity=capacity, + enqueue_many=enqueue_many, + shapes=shapes, + dynamic_pad=dynamic_pad, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) + + +def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, + enqueue_many=False, shapes=None, dynamic_pad=False, + allow_smaller_final_batch=False, shared_name=None, name=None): + """Conditionally creates batches of tensors based on `keep_input`. + + See docstring in `batch` for more details. - if allow_smaller_final_batch: - dequeued = queue.dequeue_up_to(batch_size, name=name) - else: - dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _restore_sparse_tensors(dequeued, sparse_info) - return _as_original_type(tensors, dequeued) + Args: + tensors: The list or dictionary of tensors to enqueue. + keep_input: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. + batch_size: The new batch size pulled from the queue. + num_threads: The number of threads enqueuing `tensors`. + capacity: An integer. The maximum number of elements in the queue. + enqueue_many: Whether each tensor in `tensors` is a single example. + shapes: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors`. + dynamic_pad: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. + allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. + shared_name: (Optional). If set, this queue will be shared under the given + name across multiple sessions. + name: (Optional) A name for the operations. + + Returns: + A list or dictionary of tensors with the same types as `tensors`. + + Raises: + ValueError: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. + """ + return _batch( + tensors, + batch_size, + keep_input, + num_threads=num_threads, + capacity=capacity, + enqueue_many=enqueue_many, + shapes=shapes, + dynamic_pad=dynamic_pad, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) -# TODO(josh11b): Add a thread_multiplier or num_threads (that has to be -# a multiple of len(tensor_list_list)?) parameter, to address the use -# case where you want more parallelism than you can support different -# readers (either because you don't have that many files or can't -# read that many files in parallel due to the number of seeks required). -# Once this is done, batch() can be written as a call to batch_join(). def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -784,28 +1004,67 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, ValueError: If the `shapes` are not specified, and cannot be inferred from the elements of `tensor_list_list`. 
""" - tensor_list_list = _as_tensor_list_list(tensors_list) - with ops.name_scope(name, "batch_join", _flatten(tensor_list_list)) as name: - tensor_list_list = _validate_join(tensor_list_list) - tensor_list_list, sparse_info = _store_sparse_tensors_join( - tensor_list_list, enqueue_many) - types = _dtypes(tensor_list_list) - shapes = _shapes(tensor_list_list, shapes, enqueue_many) - # TODO(josh11b,mrry): Switch to BatchQueue once it is written. - queue = _which_queue(dynamic_pad)( - capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) - _enqueue_join(queue, tensor_list_list, enqueue_many) - summary.scalar("queue/%s/fraction_of_%d_full" % (queue.name, capacity), - math_ops.cast(queue.size(), dtypes.float32) * - (1. / capacity)) + return _batch_join( + tensors_list, + batch_size, + keep_input=True, + capacity=capacity, + enqueue_many=enqueue_many, + shapes=shapes, + dynamic_pad=dynamic_pad, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) + + +def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, + enqueue_many=False, shapes=None, dynamic_pad=False, + allow_smaller_final_batch=False, shared_name=None, + name=None): + """Runs a list of tensors to conditionally fill a queue to create batches. + + See docstring in `batch_join` for more details. - if allow_smaller_final_batch: - dequeued = queue.dequeue_up_to(batch_size, name=name) - else: - dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _restore_sparse_tensors(dequeued, sparse_info) - # tensors_list was validated to not be empty. - return _as_original_type(tensors_list[0], dequeued) + Args: + tensors_list: A list of tuples or dictionaries of tensors to enqueue. + keep_input: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. + batch_size: An integer. The new batch size pulled from the queue. + capacity: An integer. The maximum number of elements in the queue. + enqueue_many: Whether each tensor in `tensor_list_list` is a single + example. + shapes: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list_list[i]`. + dynamic_pad: Boolean. Allow variable dimensions in input shapes. + The given dimensions are padded upon dequeue so that tensors within a + batch have the same shapes. + allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. + shared_name: (Optional) If set, this queue will be shared under the given + name across multiple sessions. + name: (Optional) A name for the operations. + + Returns: + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + + Raises: + ValueError: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensor_list_list`. + """ + return _batch_join( + tensors_list, + batch_size, + keep_input, + capacity=capacity, + enqueue_many=enqueue_many, + shapes=shapes, + dynamic_pad=dynamic_pad, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, @@ -890,33 +1149,71 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, ValueError: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors`. 
""" - tensor_list = _as_tensor_list(tensors) - with ops.name_scope(name, "shuffle_batch", tensor_list) as name: - tensor_list = _validate(tensor_list) - tensor_list, sparse_info = _store_sparse_tensors( - tensor_list, enqueue_many) - types = _dtypes([tensor_list]) - shapes = _shapes([tensor_list], shapes, enqueue_many) - queue = data_flow_ops.RandomShuffleQueue( - capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, - dtypes=types, shapes=shapes, shared_name=shared_name) - _enqueue(queue, tensor_list, num_threads, enqueue_many) - full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), - dtypes.float32) * - (1. / (capacity - min_after_dequeue))) - # Note that name contains a '/' at the end so we intentionally do not place - # a '/' after %s below. - summary_name = ( - "queue/%sfraction_over_%d_of_%d_full" % - (name, min_after_dequeue, capacity - min_after_dequeue)) - summary.scalar(summary_name, full) + return _shuffle_batch( + tensors, + batch_size, + capacity, + min_after_dequeue, + keep_input=True, + num_threads=num_threads, + seed=seed, + enqueue_many=enqueue_many, + shapes=shapes, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) + + +def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, + keep_input, num_threads=1, seed=None, + enqueue_many=False, shapes=None, + allow_smaller_final_batch=False, shared_name=None, + name=None): + """Creates batches by randomly shuffling conditionally-enqueued tensors. + + See docstring in `shuffle_batch` for more details. - if allow_smaller_final_batch: - dequeued = queue.dequeue_up_to(batch_size, name=name) - else: - dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _restore_sparse_tensors(dequeued, sparse_info) - return _as_original_type(tensors, dequeued) + Args: + tensors: The list or dictionary of tensors to enqueue. + batch_size: The new batch size pulled from the queue. + capacity: An integer. The maximum number of elements in the queue. + min_after_dequeue: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. + keep_input: A `bool` scalar Tensor. This tensor controls whether the input + is added to the queue or not. If it evaluates `True`, then `tensors` are + added to the queue; otherwise they are dropped. This tensor essentially + acts as a filtering mechanism. + num_threads: The number of threads enqueuing `tensor_list`. + seed: Seed for the random shuffling within the queue. + enqueue_many: Whether each tensor in `tensor_list` is a single example. + shapes: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensor_list`. + allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. + shared_name: (Optional) If set, this queue will be shared under the given + name across multiple sessions. + name: (Optional) A name for the operations. + + Returns: + A list or dictionary of tensors with the types as `tensors`. + + Raises: + ValueError: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors`. 
+ """ + return _shuffle_batch( + tensors, + batch_size, + capacity, + min_after_dequeue, + keep_input, + num_threads=num_threads, + seed=seed, + enqueue_many=enqueue_many, + shapes=shapes, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) def shuffle_batch_join(tensors_list, batch_size, capacity, @@ -993,32 +1290,67 @@ def shuffle_batch_join(tensors_list, batch_size, capacity, ValueError: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors_list`. """ - tensor_list_list = _as_tensor_list_list(tensors_list) - with ops.name_scope(name, "shuffle_batch_join", - _flatten(tensor_list_list)) as name: - tensor_list_list = _validate_join(tensor_list_list) - tensor_list_list, sparse_info = _store_sparse_tensors_join( - tensor_list_list, enqueue_many) - types = _dtypes(tensor_list_list) - shapes = _shapes(tensor_list_list, shapes, enqueue_many) - queue = data_flow_ops.RandomShuffleQueue( - capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, - dtypes=types, shapes=shapes, shared_name=shared_name) - _enqueue_join(queue, tensor_list_list, enqueue_many) - full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), - dtypes.float32) * - (1. / (capacity - min_after_dequeue))) - # Note that name contains a '/' at the end so we intentionally do not place - # a '/' after %s below. - summary_name = ( - "queue/%sfraction_over_%d_of_%d_full" % - (name, min_after_dequeue, capacity - min_after_dequeue)) - summary.scalar(summary_name, full) + return _shuffle_batch_join( + tensors_list, + batch_size, + capacity, + min_after_dequeue, + keep_input=True, + seed=seed, + enqueue_many=enqueue_many, + shapes=shapes, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) + + +def maybe_shuffle_batch_join(tensors_list, batch_size, capacity, + min_after_dequeue, keep_input, seed=None, + enqueue_many=False, shapes=None, + allow_smaller_final_batch=False, shared_name=None, + name=None): + """Create batches by randomly shuffling conditionally-enqueued tensors. + + See docstring in `shuffle_batch_join` for more details. - if allow_smaller_final_batch: - dequeued = queue.dequeue_up_to(batch_size, name=name) - else: - dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _restore_sparse_tensors(dequeued, sparse_info) - # tensors_list was validated to not be empty. - return _as_original_type(tensors_list[0], dequeued) + Args: + tensors_list: A list of tuples or dictionaries of tensors to enqueue. + batch_size: An integer. The new batch size pulled from the queue. + capacity: An integer. The maximum number of elements in the queue. + min_after_dequeue: Minimum number elements in the queue after a + dequeue, used to ensure a level of mixing of elements. + keep_input: A `bool` scalar Tensor. If provided, this tensor controls + whether the input is added to the queue or not. If it evaluates `True`, + then `tensors_list` are added to the queue; otherwise they are dropped. + This tensor essentially acts as a filtering mechanism. + seed: Seed for the random shuffling within the queue. + enqueue_many: Whether each tensor in `tensor_list_list` is a single + example. + shapes: (Optional) The shapes for each example. Defaults to the + inferred shapes for `tensors_list[i]`. + allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final + batch to be smaller if there are insufficient items left in the queue. + shared_name: (optional). 
If set, this queue will be shared under the given + name across multiple sessions. + name: (Optional) A name for the operations. + + Returns: + A list or dictionary of tensors with the same number and types as + `tensors_list[i]`. + + Raises: + ValueError: If the `shapes` are not specified, and cannot be + inferred from the elements of `tensors_list`. + """ + return _shuffle_batch_join( + tensors_list, + batch_size, + capacity, + min_after_dequeue, + keep_input, + seed=seed, + enqueue_many=enqueue_many, + shapes=shapes, + allow_smaller_final_batch=allow_smaller_final_batch, + shared_name=shared_name, + name=name) diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py index 8f3470fc55814e..93aae621b3006e 100644 --- a/tensorflow/python/training/input_test.py +++ b/tensorflow/python/training/input_test.py @@ -700,14 +700,14 @@ def testCannotInferRankError(self): tf.train.batch([x], batch_size=2) def testBatchedSparseTensorInferredShape(self): - sparse = tf.SparseTensor(indices=[[0]], values=[1.0], shape=[1]) - self.assertAllEqual((1,), sparse.shape.get_shape().as_list()) + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) batched = tf.train.batch([sparse], batch_size=2) self.assertAllEqual((2,), batched.shape.get_shape().as_list()) def testBatchedSparseTensorInferredShapeEnqueueMany(self): - sparse = tf.SparseTensor(indices=[[0]], values=[1.0], shape=[1]) - self.assertAllEqual((1,), sparse.shape.get_shape().as_list()) + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) batched = tf.train.batch([sparse], batch_size=2, enqueue_many=True) self.assertAllEqual((1,), batched.shape.get_shape().as_list()) @@ -716,7 +716,7 @@ def testBatchedSparseTensorInferredShapeUnknownRank(self): indices=tf.placeholder(tf.int64), values=tf.placeholder(tf.float32), shape=tf.placeholder(tf.int64)) - self.assertIs(None, sparse.shape.get_shape().num_elements()) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) batched = tf.train.batch([sparse], batch_size=2) self.assertIs(None, batched.shape.get_shape().num_elements()) @@ -725,7 +725,7 @@ def testBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): indices=tf.placeholder(tf.int64), values=tf.placeholder(tf.float32), shape=tf.placeholder(tf.int64)) - self.assertIs(None, sparse.shape.get_shape().num_elements()) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) batched = tf.train.batch([sparse], batch_size=2, enqueue_many=True) self.assertIs(None, batched.shape.get_shape().num_elements()) @@ -733,6 +733,83 @@ def testSingleElementDict(self): x = tf.train.batch({"c": [12, 12]}, batch_size=8) self.assertAllEqual((8, 2), x["c"].get_shape().as_list()) + def _testKeepInputHelper(self, num_threads, enqueue_many): + with self.test_session() as sess: + batch_size = 5 + num_batches = 4 + examples = tf.Variable(0) + counter = examples.count_up_to(num_batches * batch_size * 2) + sparse_counter = tf.SparseTensor( + indices=tf.zeros([1, 1], dtype=tf.int64), + values=tf.stack([tf.cast(counter, tf.float32)]), + shape=[1]) + to_batch = [counter, sparse_counter, "string"] + if enqueue_many: + to_batch = tf.train.batch(to_batch, 1) + keep_input = tf.squeeze(tf.equal(0, tf.mod(to_batch[0], 2))) + batched = tf.train.maybe_batch( + to_batch, keep_input, batch_size, num_threads=num_threads, + enqueue_many=enqueue_many) + 
tf.initialize_all_variables().run() + tf.initialize_local_variables().run() + threads = tf.train.start_queue_runners() + + for _ in range(num_batches): + results = sess.run(batched) + self.assertAllEqual([0] * batch_size, np.mod(results[0], 2)) + self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2)) + self.assertAllEqual([b"string"] * batch_size, results[2]) + + # Reached the limit. + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batched) + for thread in threads: + thread.join() + + def testSingleThreadKeepInput(self): + self._testKeepInputHelper(1, False) + + def testSingleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(1, True) + + def testMultipleThreadKeepInput(self): + self._testKeepInputHelper(5, False) + + def testMultipleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(5, True) + + def testMaybeBatchedSparseTensorInferredShape(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_batch([sparse], keep_input=True, batch_size=2) + self.assertAllEqual((2,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_batch( + [sparse], keep_input=True, batch_size=2, enqueue_many=True) + self.assertAllEqual((1,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_batch([sparse], keep_input=True, batch_size=2) + self.assertIs(None, batched.shape.get_shape().num_elements()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_batch( + [sparse], keep_input=True, batch_size=2, enqueue_many=True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + class BatchJoinTest(tf.test.TestCase): @@ -1125,6 +1202,85 @@ def testSingleElementDict(self): x = tf.train.batch_join([{"c": [12, 12]}], batch_size=8) self.assertAllEqual((8, 2), x["c"].get_shape().as_list()) + def _testKeepInputHelper(self, num_threads, enqueue_many): + with self.test_session() as sess: + batch_size = 5 + num_batches = 4 + examples = tf.Variable(0) + counter = examples.count_up_to(num_batches * batch_size * 2) + sparse_counter = tf.SparseTensor( + indices=tf.zeros([1, 1], dtype=tf.int64), + values=tf.stack([tf.cast(counter, tf.float32)]), + shape=[1]) + to_batch = [counter, sparse_counter, "string"] + if enqueue_many: + to_batch = tf.train.batch(to_batch, 1) + keep_input = tf.squeeze(tf.equal(0, tf.mod(to_batch[0], 2))) + batched = tf.train.maybe_batch_join( + [to_batch] * num_threads, keep_input, batch_size, + enqueue_many=enqueue_many) + tf.initialize_all_variables().run() + tf.initialize_local_variables().run() + threads = tf.train.start_queue_runners() + + for _ in range(num_batches): + results = sess.run(batched) + self.assertAllEqual([0] * batch_size, np.mod(results[0], 2),) + 
self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2),) + self.assertAllEqual([b"string"] * batch_size, results[2]) + + # Reached the limit. + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batched) + for thread in threads: + thread.join() + + def testSingleThreadKeepInput(self): + self._testKeepInputHelper(1, False) + + def testSingleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(1, True) + + def testMultipleThreadKeepInput(self): + self._testKeepInputHelper(5, False) + + def testMultipleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(5, True) + + def testMaybeBatchedSparseTensorInferredShape(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_batch_join( + [[sparse]], keep_input=True, batch_size=2) + self.assertAllEqual((2,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_batch_join( + [[sparse]], keep_input=True, batch_size=2, enqueue_many=True) + self.assertAllEqual((1,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_batch_join( + [[sparse]], keep_input=True, batch_size=2) + self.assertIs(None, batched.shape.get_shape().num_elements()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_batch_join( + [[sparse]], keep_input=True, batch_size=2, enqueue_many=True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + class ShuffleBatchTest(tf.test.TestCase): @@ -1351,6 +1507,83 @@ def testSharedName(self): "s: 'SHARED_NAME_XYZ'", batched[0].op.inputs[0].op.node_def.attr["shared_name"]) + def _testKeepInputHelper(self, num_threads, enqueue_many): + with self.test_session() as sess: + batch_size = 5 + num_batches = 4 + examples = tf.Variable(0) + counter = examples.count_up_to(num_batches * batch_size * 2) + sparse_counter = tf.SparseTensor( + indices=tf.zeros([1, 1], dtype=tf.int64), + values=tf.stack([tf.cast(counter, tf.float32)]), + shape=[1]) + to_batch = [counter, sparse_counter, "string"] + if enqueue_many: + to_batch = tf.train.batch(to_batch, 1) + keep_input = tf.squeeze(tf.equal(0, tf.mod(to_batch[0], 2))) + batched = tf.train.maybe_shuffle_batch( + to_batch, batch_size, 10, 1, keep_input, num_threads=num_threads, + enqueue_many=enqueue_many) + tf.initialize_all_variables().run() + tf.initialize_local_variables().run() + threads = tf.train.start_queue_runners() + + for _ in range(num_batches): + results = sess.run(batched) + self.assertAllEqual([0] * batch_size, np.mod(results[0], 2)) + self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2)) + self.assertAllEqual([b"string"] * batch_size, results[2]) + + # Reached the limit. 
+ with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batched) + for thread in threads: + thread.join() + + def testSingleThreadKeepInput(self): + self._testKeepInputHelper(1, False) + + def testSingleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(1, True) + + def testMultipleThreadKeepInput(self): + self._testKeepInputHelper(5, False) + + def testMultipleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(5, True) + + def testMaybeBatchedSparseTensorInferredShape(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_shuffle_batch([sparse], 2, 10, 1, True) + self.assertAllEqual((2,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_shuffle_batch( + [sparse], 2, 10, 1, True, enqueue_many=True) + self.assertAllEqual((1,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_shuffle_batch([sparse], 2, 10, 1, True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_shuffle_batch( + [sparse], 2, 10, 1, True, enqueue_many=True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + class ShuffleBatchJoinTest(tf.test.TestCase): @@ -1581,6 +1814,83 @@ def testSharedName(self): "s: 'SHARED_NAME_XYZ'", batched[0].op.inputs[0].op.node_def.attr["shared_name"]) + def _testKeepInputHelper(self, num_threads, enqueue_many): + with self.test_session() as sess: + batch_size = 5 + num_batches = 4 + examples = tf.Variable(0) + counter = examples.count_up_to(num_batches * batch_size * 2) + sparse_counter = tf.SparseTensor( + indices=tf.zeros([1, 1], dtype=tf.int64), + values=tf.stack([tf.cast(counter, tf.float32)]), + shape=[1]) + to_batch = [counter, sparse_counter, "string"] + if enqueue_many: + to_batch = tf.train.batch(to_batch, 1) + keep_input = tf.squeeze(tf.equal(0, tf.mod(to_batch[0], 2))) + batched = tf.train.maybe_shuffle_batch_join( + [to_batch] * num_threads, batch_size, 10, 1, keep_input, + enqueue_many=enqueue_many) + tf.initialize_all_variables().run() + tf.initialize_local_variables().run() + threads = tf.train.start_queue_runners() + + for _ in range(num_batches): + results = sess.run(batched) + self.assertAllEqual([0] * batch_size, np.mod(results[0], 2)) + self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2)) + self.assertAllEqual([b"string"] * batch_size, results[2]) + + # Reached the limit. 
+ with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batched) + for thread in threads: + thread.join() + + def testSingleThreadKeepInput(self): + self._testKeepInputHelper(1, False) + + def testSingleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(1, True) + + def testMultipleThreadKeepInput(self): + self._testKeepInputHelper(5, False) + + def testMultipleThreadKeepInputEnqueueMany(self): + self._testKeepInputHelper(5, True) + + def testMaybeBatchedSparseTensorInferredShape(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True) + self.assertAllEqual((2,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self): + sparse = tf.SparseTensor(indices=[[0]], values=[1.0], dense_shape=[1]) + self.assertAllEqual((1,), sparse.dense_shape.get_shape().as_list()) + batched = tf.train.maybe_shuffle_batch_join( + [[sparse]], 2, 10, 1, True, enqueue_many=True) + self.assertAllEqual((1,), batched.shape.get_shape().as_list()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + + def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): + sparse = tf.SparseTensor( + indices=tf.placeholder(tf.int64), + values=tf.placeholder(tf.float32), + shape=tf.placeholder(tf.int64)) + self.assertIs(None, sparse.dense_shape.get_shape().num_elements()) + batched = tf.train.maybe_shuffle_batch_join( + [[sparse]], 2, 10, 1, True, enqueue_many=True) + self.assertIs(None, batched.shape.get_shape().num_elements()) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index 45438b1342698e..ff9b0e5ae21527 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -170,17 +170,19 @@ def _zero_debias(unbiased_var, value, decay): with variable_scope.variable_scope( unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope: with ops.colocate_with(unbiased_var): + with ops.control_dependencies(None): + biased_initializer = init_ops.zeros_initializer( + unbiased_var.get_shape(), dtype=unbiased_var.dtype) + local_step_initializer = init_ops.ones_initializer() biased_var = variable_scope.get_variable( - "biased", - initializer=init_ops.zeros_initializer( - unbiased_var.get_shape(), dtype=unbiased_var.dtype), - trainable=False) + "biased", initializer=biased_initializer, trainable=False) # Initializing the local_step to `0` would cause problems with the # debiasing equation, so we instead initialize to `1`. local_step = variable_scope.get_variable( "local_step", - shape=[], dtype=unbiased_var.dtype, - initializer=init_ops.ones_initializer(), + shape=[], + dtype=unbiased_var.dtype, + initializer=local_step_initializer, trainable=False) # Get an update ops for both shadow variables. @@ -329,7 +331,7 @@ def apply(self, var_list=None): shadow variables are created with `trainable=False` and added to the `GraphKeys.ALL_VARIABLES` collection. 
They will be returned by calls to - `tf.all_variables()`. + `tf.global_variables()`. Returns an op that updates all shadow variables as described above. @@ -376,7 +378,7 @@ def apply(self, var_list=None): avg = slot_creator.create_zeros_slot( var, self._name, - colocate_with_primary=(var.op.type == "Variable")) + colocate_with_primary=(var.op.type in ["Variable", "VariableV2"])) if self._zero_debias: zero_debias_true.add(avg) self._averages[var] = avg diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index cb4e1de2353b7f..4884bd282b1956 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -56,6 +56,7 @@ # Op names which identify variable reads which should be saved. _VARIABLE_OPS = set(["Variable", + "VariableV2", "AutoReloadVariable", "ReadVariableOp", "ResourceGather"]) @@ -500,6 +501,11 @@ def OpListToDict(op_list): for var in op_list: if isinstance(var, BaseSaverBuilder.SaveableObject): names_to_saveables[var.name] = var + elif isinstance(var, variables.PartitionedVariable): + if var.name in names_to_saveables: + raise ValueError("At least two variables have the same name: %s" % + var.name) + names_to_saveables[var.name] = var elif isinstance(var, variables.Variable) and var._save_slice_info: name = var._save_slice_info.full_name if name in names_to_saveables: @@ -550,7 +556,9 @@ def _ValidateAndSliceInputs(self, names_to_saveables): op = names_to_saveables[name] if isinstance(op, BaseSaverBuilder.SaveableObject): self._AddSaveable(saveables, seen_ops, op) - elif isinstance(op, (list, tuple)): + elif isinstance(op, (list, tuple, variables.PartitionedVariable)): + if isinstance(op, variables.PartitionedVariable): + op = list(op) # A set of slices. slice_name = None # pylint: disable=protected-access @@ -576,7 +584,7 @@ def _ValidateAndSliceInputs(self, names_to_saveables): raise TypeError("names_to_saveables must be a dict mapping string " "names to Tensors/Variables. Not a variable: %s" % variable) - if variable.op.type in ["Variable", "AutoReloadVariable"]: + if variable.op.type in ["Variable", "VariableV2", "AutoReloadVariable"]: saveable = BaseSaverBuilder.VariableSaveable(variable, "", name) else: saveable = BaseSaverBuilder.ResourceVariableSaveable( diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index 5ba50de1aceea3..1cfa48c176cb9b 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -112,7 +112,7 @@ def restore(self, restore_tensors, shapes): class SaverTest(tf.test.TestCase): def basicSaveRestore(self, variable_op): - save_path = os.path.join(self.get_temp_dir(), "basics") + save_path = os.path.join(self.get_temp_dir(), "basic_save_restore") # Build a graph with 2 parameter nodes, and Save and # Restore nodes for them. @@ -241,6 +241,17 @@ def testSomeErrors(self): # The names are different and will work. tf.train.Saver({"vee1": v1, "other": [v2]}) + # Partitioned variables also cause name conflicts. 
+ p_v1 = tf.get_variable( + "p_v1", shape=[4, 5], + partitioner=tf.fixed_size_partitioner(num_shards=2)) + p_v2 = tf.get_variable( + "p_v2", shape=[4, 5], + partitioner=tf.fixed_size_partitioner(num_shards=2)) + p_v2._name = "p_v1" + with self.assertRaisesRegexp(ValueError, "same name: p_v1"): + tf.train.Saver([p_v1, p_v2]) + def testSameName(self): with tf.Graph().as_default(): v0 = tf.Variable([10.0], name="v0") @@ -444,7 +455,7 @@ def testDeferredBuild(self): self.assertAllClose([2.0, 2.0, 2.0], twos.eval()) def testReshape(self): - save_path = os.path.join(self.get_temp_dir(), "variables") + save_path = os.path.join(self.get_temp_dir(), "variables_reshape") with tf.Session("", graph=tf.Graph()) as sess: var = tf.Variable([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) init = tf.global_variables_initializer() @@ -524,7 +535,7 @@ def testSaveToNonexistingPath(self): class SaveRestoreShardedTest(tf.test.TestCase): def testBasics(self): - save_path = os.path.join(self.get_temp_dir(), "sharded") + save_path = os.path.join(self.get_temp_dir(), "sharded_basics") # Build a graph with 2 parameter nodes on different devices. with tf.Session( @@ -615,7 +626,7 @@ def testBasics(self): self.assertEqual(33.0, t0.values().eval()) self.assertEqual(b"k22", t1.keys().eval()) self.assertEqual(44.0, t1.values().eval()) - save_path = os.path.join(self.get_temp_dir(), "sharded") + save_path = os.path.join(self.get_temp_dir(), "sharded_basics") if save._write_version is saver_pb2.SaverDef.V1: save.restore(sess, save_path + "-?????-of-?????") else: @@ -630,11 +641,11 @@ def testBasics(self): if save._write_version is saver_pb2.SaverDef.V1: self.assertEqual( tf.train.latest_checkpoint(self.get_temp_dir()), - os.path.join(self.get_temp_dir(), "sharded-?????-of-00002")) + os.path.join(self.get_temp_dir(), "sharded_basics-?????-of-00002")) else: self.assertEqual( tf.train.latest_checkpoint(self.get_temp_dir()), - os.path.join(self.get_temp_dir(), "sharded")) + os.path.join(self.get_temp_dir(), "sharded_basics")) def testSaverDef(self): with self.test_session(): @@ -648,67 +659,98 @@ def testPartitionedVariables(self): # Allows save/restore mechanism to work w/ different slicings. var_name = "my_var" saved_path = os.path.join(_TestDir("partitioned_variables"), "ckpt") + call_saver_with_dict = False # updated by test loop below - def _save(slices): + def _save(slices=None, partitioner=None): with self.test_session(graph=tf.Graph()) as sess: # Calls .eval() to return the ndarray that makes up the full variable. rnd = tf.random_uniform(var_full_shape).eval() if slices: + assert not partitioner vs = tf.create_partitioned_variables(var_full_shape, slices, rnd, name=var_name) + elif partitioner: + vs = [tf.get_variable(var_name, shape=var_full_shape, + initializer=rnd, + partitioner=partitioner)] else: vs = [tf.Variable(rnd, name=var_name)] tf.global_variables_initializer().run() - saver = tf.train.Saver(vs) + if call_saver_with_dict: + saver = tf.train.Saver({var_name: (vs if slices else vs[0])}) + else: + saver = tf.train.Saver(vs) actual_path = saver.save(sess, saved_path) self.assertEqual(saved_path, actual_path) return rnd - def _restore(slices): + def _restore(slices=None, partitioner=None): with self.test_session(graph=tf.Graph()) as sess: if slices: + assert not partitioner new_vs = tf.create_partitioned_variables( var_full_shape, slices, tf.zeros(var_full_shape), # != original contents. 
name=var_name) + elif partitioner: + new_vs = [tf.get_variable(var_name, shape=var_full_shape, + initializer=tf.zeros(var_full_shape), + partitioner=partitioner)] else: new_vs = [tf.Variable( tf.zeros(shape=var_full_shape), # != original contents. name=var_name)] tf.global_variables_initializer().run() - saver = tf.train.Saver(new_vs) + if call_saver_with_dict: + saver = tf.train.Saver({var_name: (new_vs if slices else new_vs[0])}) + else: + saver = tf.train.Saver(new_vs) saver.restore(sess, saved_path) - if slices and slices[0] != 1: + if partitioner: + return new_vs[0].as_tensor().eval() + elif slices and slices[0] != 1: return tf.concat(0, new_vs).eval() elif slices and slices[1] != 1: return tf.concat(1, new_vs).eval() else: # Non-sliced. return new_vs[0].eval() - # Saves 10 horizontal parts of a partitioned variable. - # Restores into a full variable, non-sliced. - saved_full = _save(slices=[10, 1]) - restored_full = _restore(slices=None) - self.assertAllEqual(saved_full, restored_full) + for call_saver_with_dict in {False, True}: + # Save PartitionedVariable and restore into full variable. + saved_full = _save( + partitioner=tf.fixed_size_partitioner(num_shards=2)) + restored_full = _restore() + self.assertAllEqual(saved_full, restored_full) + + # Saves 10 horizontal parts of a partitioned variable. + # Restores into a full variable, non-sliced. + saved_full = _save(slices=[10, 1]) + restored_full = _restore() + self.assertAllEqual(saved_full, restored_full) + + # Restores into a different number/orientation of slices. + restored_full = _restore(slices=[2, 1]) # 2 horizon parts. + self.assertAllEqual(saved_full, restored_full) + restored_full = _restore(slices=[1, 3]) # 3 vertical parts. + self.assertAllEqual(saved_full, restored_full) - # Restores into a different number/orientation of slices. - restored_full = _restore(slices=[2, 1]) # 2 horizon parts. - self.assertAllEqual(saved_full, restored_full) - restored_full = _restore(slices=[1, 3]) # 3 vertical parts. - self.assertAllEqual(saved_full, restored_full) + # Restores into a PartitionedVariable + restored_full = _restore( + partitioner=tf.fixed_size_partitioner(num_shards=2)) + self.assertAllEqual(saved_full, restored_full) - # Now, saves a full variable and restores in slices. - saved_full = _save(slices=None) - restored_full = _restore(slices=[1, 3]) - self.assertAllEqual(saved_full, restored_full) + # Now, saves a full variable and restores in slices. + saved_full = _save() + restored_full = _restore(slices=[1, 3]) + self.assertAllEqual(saved_full, restored_full) class MaxToKeepTest(tf.test.TestCase): @@ -959,7 +1001,7 @@ def testNonSharded(self): class SaveRestoreWithVariableNameMap(tf.test.TestCase): def testNonReshape(self): - save_path = os.path.join(self.get_temp_dir(), "basics") + save_path = os.path.join(self.get_temp_dir(), "non_reshape") with self.test_session() as sess: # Build a graph with 2 parameter nodes, and Save and @@ -1872,7 +1914,7 @@ def loop_body(it, biases): tf.add_to_collection("logits", logits) # The rest of the variables. 
- rest_variables = list(set(tf.all_variables()) - set(var_list.keys())) + rest_variables = list(set(tf.global_variables()) - set(var_list.keys())) init_rest_op = tf.initialize_variables(rest_variables) with self.test_session(graph=graph) as sess: diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py index fa81f9b83fd4e0..1d3dcda119730d 100644 --- a/tensorflow/python/training/session_manager_test.py +++ b/tensorflow/python/training/session_manager_test.py @@ -154,7 +154,7 @@ def testInitWithNoneLocalInitOpError(self): "you must also pass a local_init_op "): tf.train.SessionManager( ready_for_local_init_op=tf.report_uninitialized_variables( - tf.all_variables()), + tf.global_variables()), local_init_op=None) def testRecoverSessionWithReadyForLocalInitOp(self): @@ -192,7 +192,7 @@ def testRecoverSessionWithReadyForLocalInitOp(self): sm2 = tf.train.SessionManager( ready_op=tf.report_uninitialized_variables(), ready_for_local_init_op=tf.report_uninitialized_variables( - tf.all_variables()), + tf.global_variables()), local_init_op=w.initializer) saver = tf.train.Saver({"v": v}) sess, initialized = sm2.recover_session( @@ -348,7 +348,7 @@ def testWaitForSessionLocalInit(self): graph=graph, ready_op=tf.report_uninitialized_variables(), ready_for_local_init_op=tf.report_uninitialized_variables( - tf.all_variables()), + tf.global_variables()), local_init_op=w.initializer) # Initialize v but not w @@ -417,7 +417,7 @@ def testPrepareSessionWithReadyForLocalInitOp(self): sm2 = tf.train.SessionManager( ready_op=tf.report_uninitialized_variables(), ready_for_local_init_op=tf.report_uninitialized_variables( - tf.all_variables()), + tf.global_variables()), local_init_op=w.initializer) sess = sm2.prepare_session("", init_op=v.initializer) self.assertEqual( @@ -462,7 +462,7 @@ def testPrepareSessionWithReadyNotReadyForLocal(self): sm2 = tf.train.SessionManager( ready_op=tf.report_uninitialized_variables(), ready_for_local_init_op=tf.report_uninitialized_variables( - tf.all_variables()), + tf.global_variables()), local_init_op=w.initializer) with self.assertRaisesRegexp( RuntimeError, diff --git a/tensorflow/python/training/summary_io.py b/tensorflow/python/training/summary_io.py index 4d1c3e7954f4dc..970c67f5e40f8f 100644 --- a/tensorflow/python/training/summary_io.py +++ b/tensorflow/python/training/summary_io.py @@ -58,7 +58,7 @@ def __init__(self, # Launch the graph in a session. sess = tf.Session() # Create a summary writer, add the 'graph' to the event file. - writer = tf.train.SummaryWriter(, sess.graph) + writer = tf.summary.FileWriter(, sess.graph) ``` The other arguments to the constructor control the asynchronous writes to diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index af51c074173663..8159ddc398d893 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -248,7 +248,7 @@ def __init__(self, ready to run the local_init_op. The model is considered ready if it returns an empty array. Defaults to the tensor returned from - `tf.report_uninitialized_variables(tf.all_variables())`. If `None`, the + `tf.report_uninitialized_variables(tf.global_variables())`. If `None`, the model is not checked for readiness before running local_init_op. is_chief: If True, create a chief supervisor in charge of initializing and restoring the model. 
If False, create a supervisor that relies @@ -890,7 +890,7 @@ def _verify_setup(self): # In that case all Variables must have their device set. if not self._is_chief: for op in self._graph.get_operations(): - if op.type == "Variable" and not op.device: + if op.type in ["Variable", "VariableV2"] and not op.device: raise ValueError("When using replicas, all Variables must have " "their device set: %s" % op) diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py index dda0166aa630f5..c7c16cdf81be42 100644 --- a/tensorflow/python/training/supervisor_test.py +++ b/tensorflow/python/training/supervisor_test.py @@ -531,7 +531,7 @@ def get_session(is_chief): collections=[tf.GraphKeys.LOCAL_VARIABLES], name="default_ready_for_local_init_op_w_" + str(uid)) ready_for_local_init_op = tf.report_uninitialized_variables( - tf.all_variables()) + tf.global_variables()) sv = tf.train.Supervisor( logdir=logdir, is_chief=is_chief, @@ -588,7 +588,7 @@ def get_session(is_chief): collections=[tf.GraphKeys.LOCAL_VARIABLES], name="ready_for_local_init_op_restore_w_" + str(uid)) ready_for_local_init_op = tf.report_uninitialized_variables( - tf.all_variables()) + tf.global_variables()) sv = tf.train.Supervisor( logdir=logdir, is_chief=is_chief, @@ -624,7 +624,7 @@ def testLocalInitOp(self): # This shouldn't add a variable to the VARIABLES collection responsible # for variables that are saved/restored from checkpoints. - self.assertEquals(len(tf.all_variables()), 0) + self.assertEquals(len(tf.global_variables()), 0) # Suppress normal variable inits to make sure the local one is # initialized via local_init_op. @@ -644,7 +644,7 @@ def testLocalInitOpForNonChief(self): collections=[tf.GraphKeys.LOCAL_VARIABLES]) # This shouldn't add a variable to the VARIABLES collection responsible # for variables that are saved/restored from checkpoints. - self.assertEquals(len(tf.all_variables()), 0) + self.assertEquals(len(tf.global_variables()), 0) # Suppress normal variable inits to make sure the local one is # initialized via local_init_op. diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py index e368913e981ffa..005c815a39733c 100644 --- a/tensorflow/python/training/training.py +++ b/tensorflow/python/training/training.py @@ -110,42 +110,15 @@ @@WorkerSessionCreator @@MonitoredSession -## Summary Operations - -The following ops output -[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -protocol buffers as serialized string tensors. - -You can fetch the output of a summary op in a session, and pass it to -a [SummaryWriter](../../api_docs/python/train.md#SummaryWriter) to append it -to an event file. Event files contain -[`Event`](https://www.tensorflow.org/code/tensorflow/core/util/event.proto) -protos that can contain `Summary` protos along with the timestamp and -step. You can then use TensorBoard to visualize the contents of the -event files. See [TensorBoard and -Summaries](../../how_tos/summaries_and_tensorboard/index.md) for more -details. - -@@scalar_summary -@@image_summary -@@audio_summary -@@histogram_summary -@@zero_fraction - -@@merge_summary -@@merge_all_summaries - -## Adding Summaries to Event Files +## Reading Summaries from Event Files See [Summaries and TensorBoard](../../how_tos/summaries_and_tensorboard/index.md) for an overview of summaries, event files, and visualization in TensorBoard. 
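Since `summary_iterator` is now the documented way to read event files from this module, a brief sketch of its use may help; the event-file path below is a placeholder, not a real file.

    import tensorflow as tf

    # Point this at an actual events.out.tfevents.* file written by a
    # summary writer; the path here is hypothetical.
    for event in tf.train.summary_iterator(
        "/tmp/logdir/events.out.tfevents.example"):
      for value in event.summary.value:
        # simple_value is populated for scalar summaries.
        print(event.step, value.tag, value.simple_value)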
-@@SummaryWriter -@@SummaryWriterCache @@summary_iterator -## Training utilities +## Training Utilities @@global_step @@basic_train_loop @@ -239,8 +212,6 @@ from tensorflow.python.training.session_run_hook import SessionRunValues from tensorflow.python.training.session_manager import SessionManager from tensorflow.python.training.summary_io import summary_iterator -from tensorflow.python.training.summary_io import SummaryWriter -from tensorflow.python.training.summary_io import SummaryWriterCache from tensorflow.python.training.supervisor import Supervisor from tensorflow.python.training.training_util import write_graph from tensorflow.python.training.training_util import global_step diff --git a/tensorflow/python/util/decorator_utils.py b/tensorflow/python/util/decorator_utils.py index 155003498ce7f5..df259c7f7c29f9 100644 --- a/tensorflow/python/util/decorator_utils.py +++ b/tensorflow/python/util/decorator_utils.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function +import sys + def get_qualified_name(function): # Python 3 @@ -30,13 +32,54 @@ def get_qualified_name(function): return function.__name__ +def _normalize_docstring(docstring): + """Normalizes the docstring. + + Replaces tabs with spaces, removes leading and trailing blanks lines, and + removes any indentation. + + Copied from PEP-257: + https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation + + Args: + docstring: the docstring to normalize + + Returns: + The normalized docstring + """ + if not docstring: + return '' + # Convert tabs to spaces (following the normal Python rules) + # and split into a list of lines: + lines = docstring.expandtabs().splitlines() + # Determine minimum indentation (first line doesn't count): + # (we use sys.maxsize because sys.maxint doesn't exist in Python 3) + indent = sys.maxsize + for line in lines[1:]: + stripped = line.lstrip() + if stripped: + indent = min(indent, len(line) - len(stripped)) + # Remove indentation (first line is special): + trimmed = [lines[0].strip()] + if indent < sys.maxsize: + for line in lines[1:]: + trimmed.append(line[indent:].rstrip()) + # Strip off trailing and leading blank lines: + while trimmed and not trimmed[-1]: + trimmed.pop() + while trimmed and not trimmed[0]: + trimmed.pop(0) + # Return a single string: + return '\n'.join(trimmed) + + def add_notice_to_docstring( doc, instructions, no_doc_str, suffix_str, notice): """Adds a deprecation notice to a docstring.""" if not doc: lines = [no_doc_str] else: - lines = doc.splitlines() + lines = _normalize_docstring(doc).splitlines() lines[0] += ' ' + suffix_str notice = [''] + notice + [instructions] @@ -60,3 +103,25 @@ def validate_callable(func, decorator_name): ' @property appears before @%s in your source code:' '\n\n@property\n@%s\ndef method(...)' % ( func, decorator_name, decorator_name)) + + +class classproperty(object): # pylint: disable=invalid-name + """Class property decorator. 
+ + Example usage: + + class MyClass(object): + + @classproperty + def value(cls): + return '123' + + > print MyClass.value + 123 + """ + + def __init__(self, func): + self._func = func + + def __get__(self, owner_self, owner_cls): + return self._func(owner_cls) diff --git a/tensorflow/python/util/decorator_utils_test.py b/tensorflow/python/util/decorator_utils_test.py index 7a72239ad0365b..bd18fcf7f248d8 100644 --- a/tensorflow/python/util/decorator_utils_test.py +++ b/tensorflow/python/util/decorator_utils_test.py @@ -56,22 +56,44 @@ def _check(self, doc, expected): expected) def test_regular(self): - self._check("Brief\n\nDocstring", - "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring") + expected = ("Brief (suffix)\n\nGo away\nInstructions\n\nDocstring\n\n" + "Args:\n arg1: desc") + # No indent for main docstring + self._check("Brief\n\nDocstring\n\nArgs:\n arg1: desc", expected) + # 2 space indent for main docstring, blank lines not indented + self._check("Brief\n\n Docstring\n\n Args:\n arg1: desc", expected) + # 2 space indent for main docstring, blank lines indented as well. + self._check("Brief\n \n Docstring\n \n Args:\n arg1: desc", expected) + # No indent for main docstring, first line blank. + self._check("\n Brief\n \n Docstring\n \n Args:\n arg1: desc", + expected) + # 2 space indent, first line blank. + self._check("\n Brief\n \n Docstring\n \n Args:\n arg1: desc", + expected) def test_brief_only(self): - self._check("Brief", - "Brief (suffix)\n\nGo away\nInstructions") + expected = "Brief (suffix)\n\nGo away\nInstructions" + self._check("Brief", expected) + self._check("Brief\n", expected) + self._check("Brief\n ", expected) + self._check("\nBrief\n ", expected) + self._check("\n Brief\n ", expected) def test_no_docstring(self): - self._check(None, - "Nothing here\n\nGo away\nInstructions") - self._check("", - "Nothing here\n\nGo away\nInstructions") + expected = "Nothing here\n\nGo away\nInstructions" + self._check(None, expected) + self._check("", expected) def test_no_empty_line(self): - self._check("Brief\nDocstring", - "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring") + expected = "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring" + # No second line indent + self._check("Brief\nDocstring", expected) + # 2 space second line indent + self._check("Brief\n Docstring", expected) + # No second line indent, first line blank + self._check("\nBrief\nDocstring", expected) + # 2 space second line indent, first line blank + self._check("\n Brief\n Docstring", expected) class ValidateCallableTest(tf.test.TestCase): diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py index 8e0d9bbb06fce8..a31ae90610c47f 100644 --- a/tensorflow/python/util/deprecation.py +++ b/tensorflow/python/util/deprecation.py @@ -303,3 +303,29 @@ def new_func(*args, **kwargs): func.__doc__, date, instructions) return new_func return deprecated_wrapper + + +def deprecated_argument_lookup(new_name, new_value, old_name, old_value): + """Looks up deprecated argument name and ensures both are not used. + + Args: + new_name: new name of argument + new_value: value of new argument (or None if not used) + old_name: old name of argument + old_value: value of old argument (or None if not used) + Returns: + The effective argument that should be used. 
+ Raises: + ValueError: if new_value and old_value are both non-null + """ + if old_value is not None: + if new_value is not None: + raise ValueError("Cannot specify both '%s' and '%s'" % + (old_name, new_name)) + return old_value + return new_value + + +def rewrite_argument_docstring(old_doc, old_argument, new_argument): + return old_doc.replace('`%s`' % old_argument, '`%s`' % new_argument).replace( + '%s:' % old_argument, '%s:' % new_argument) diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py index 75bd054d7f358b..4832bdd25b401c 100644 --- a/tensorflow/python/util/deprecation_test.py +++ b/tensorflow/python/util/deprecation_test.py @@ -71,13 +71,12 @@ def _fn(arg0, arg1): "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" "\n" - "\n Args:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." "\n" - "\n Returns:" - "\n Sum of args." - "\n " % (date, instructions), + "\nReturns:" + "\n Sum of args." % (date, instructions), _fn.__doc__) # Assert calling new fn issues log warning. @@ -169,13 +168,12 @@ def _fn(self, arg0, arg1): "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" "\n" - "\n Args:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." "\n" - "\n Returns:" - "\n Sum of args." - "\n " % (date, instructions), + "\nReturns:" + "\n Sum of args." % (date, instructions), getattr(_Object, "_fn").__doc__) # Assert calling new fn issues log warning. @@ -289,9 +287,8 @@ def _prop(self): "\nInstructions for updating:" "\n%s" "\n" - "\n Returns:" - "\n String." - "\n " % (date, instructions), + "\nReturns:" + "\n String." % (date, instructions), getattr(_Object, "_prop").__doc__) # Assert calling new fn issues log warning. @@ -394,14 +391,13 @@ def _fn(arg0, arg1, deprecated=True): "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." "\nInstructions for updating:\n%s" "\n" - "\n Args:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." - "\n deprecated: Deprecated!" + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n deprecated: Deprecated!" "\n" - "\n Returns:" - "\n Sum of args." - "\n " % (date, instructions), + "\nReturns:" + "\n Sum of args." % (date, instructions), _fn.__doc__) # Assert calls without the deprecated argument log nothing. @@ -628,14 +624,13 @@ def _fn(arg0, arg1, deprecated=True): "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." "\nInstructions for updating:\n%s" "\n" - "\n Args:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." - "\n deprecated: Deprecated!" + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n deprecated: Deprecated!" "\n" - "\n Returns:" - "\n Sum of args." - "\n " % (date, instructions), + "\nReturns:" + "\n Sum of args." % (date, instructions), _fn.__doc__) # Assert calling new fn with non-deprecated value logs nothing. 
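A minimal sketch of how the new `deprecated_argument_lookup` helper is intended to be used when a keyword argument is renamed. The wrapper function and argument names below are hypothetical examples, not code from this patch.

    from tensorflow.python.util import deprecation

    def reduce_over(axis=None, reduction_indices=None):
      # Exactly one of the new and old names may be supplied; passing both
      # raises ValueError, passing neither yields None.
      axis = deprecation.deprecated_argument_lookup(
          "axis", axis, "reduction_indices", reduction_indices)
      return axis

    reduce_over(axis=0)               # -> 0
    reduce_over(reduction_indices=1)  # -> 1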
@@ -722,5 +717,37 @@ def _fn(arg0, arg1, deprecated=True): self.assertEqual(2, mock_warning.call_count) +class DeprecationArgumentsTest(tf.test.TestCase): + + def testDeprecatedArgumentLookup(self): + good_value = 3 + self.assertEqual(deprecation.deprecated_argument_lookup( + "val_new", good_value, "val_old", None), good_value) + self.assertEqual(deprecation.deprecated_argument_lookup( + "val_new", None, "val_old", good_value), good_value) + with self.assertRaisesRegexp(ValueError, + "Cannot specify both 'val_old' and 'val_new'"): + self.assertEqual(deprecation.deprecated_argument_lookup( + "val_new", good_value, "val_old", good_value), good_value) + + def testRewriteArgumentDocstring(self): + docs = """Add `a` and `b` + + Args: + a: first arg + b: second arg + """ + new_docs = deprecation.rewrite_argument_docstring( + deprecation.rewrite_argument_docstring(docs, "a", "left"), + "b", "right") + new_docs_ref = """Add `left` and `right` + + Args: + left: first arg + right: second arg + """ + self.assertEqual(new_docs, new_docs_ref) + + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index 256b1287506905..a84f9d5f9df113 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -7,6 +7,8 @@ cc_library( srcs = glob( [ "*.cc", + "host/*.cc", + "cuda/cuda_platform_id.cc", "lib/*.cc", "platform/default/*.cc", ], @@ -14,13 +16,17 @@ cc_library( "**/*_test.cc", ], ) + if_cuda( - glob([ - "cuda/*.cc", - ]), + glob( + [ + "cuda/*.cc", + ], + exclude = ["cuda/cuda_platform_id.cc"], + ), ), hdrs = glob([ "*.h", "cuda/*.h", + "host/*.h", "lib/*.h", "lib/gtl/*.h", "platform/**/*.h", diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc new file mode 100644 index 00000000000000..ff07432bb7c16e --- /dev/null +++ b/tensorflow/stream_executor/host/host_gpu_executor.cc @@ -0,0 +1,263 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implementation of HostExecutor class [of those methods not defined in the +// class declaration]. 
+#include "tensorflow/stream_executor/host/host_gpu_executor.h" + +#include + +#include "tensorflow/stream_executor/host/host_platform_id.h" +#include "tensorflow/stream_executor/host/host_stream.h" +#include "tensorflow/stream_executor/host/host_timer.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/plugin_registry.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +HostStream *AsHostStream(Stream *stream) { + DCHECK(stream != nullptr); + return dynamic_cast(stream->implementation()); +} + +HostExecutor::HostExecutor(const PluginConfig &plugin_config) + : plugin_config_(plugin_config) {} + +HostExecutor::~HostExecutor() {} + +void *HostExecutor::Allocate(uint64 size) { return new char[size]; } + +void *HostExecutor::AllocateSubBuffer(DeviceMemoryBase *parent, + uint64 offset_bytes, uint64 size_bytes) { + return reinterpret_cast(parent->opaque()) + offset_bytes; +} + +void HostExecutor::Deallocate(DeviceMemoryBase *mem) { + if (!mem->is_sub_buffer()) { + delete[] static_cast(mem->opaque()); + } +} + +bool HostExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { + memset(location->opaque(), 0, size); + return true; +} + +bool HostExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + memset(location->opaque(), value, size); + return true; +} + +bool HostExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + // Enqueue the [asynchronous] memcpy on the stream (HostStream) associated + // with the HostExecutor. + void *src_mem = const_cast(gpu_src.opaque()); + AsHostStream(stream)->EnqueueTask( + [host_dst, src_mem, size]() { memcpy(host_dst, src_mem, size); }); + return true; +} + +bool HostExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + void *dst_mem = gpu_dst->opaque(); + // Enqueue the [asynchronous] memcpy on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [dst_mem, host_src, size]() { memcpy(dst_mem, host_src, size); }); + return true; +} + +bool HostExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + void *dst_mem = gpu_dst->opaque(); + void *src_mem = const_cast(gpu_src.opaque()); + // Enqueue this [asynchronous] "device-to-device" (i.e., host-to-host, given + // the nature of the HostExecutor) memcpy on the stream (HostStream) + // associated with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [src_mem, dst_mem, size]() { memcpy(src_mem, dst_mem, size); }); + return true; +} + +bool HostExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [gpu_mem, size]() { memset(gpu_mem, 0, size); }); + return true; +} + +bool HostExecutor::Memset(Stream *stream, DeviceMemoryBase *location, + uint8 pattern, uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. 
+ AsHostStream(stream)->EnqueueTask( + [gpu_mem, size, pattern]() { memset(gpu_mem, pattern, size); }); + return true; +} + +bool HostExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [gpu_mem, size, pattern]() { memset(gpu_mem, pattern, size); }); + return true; +} + +bool HostExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + memcpy(gpu_dst->opaque(), host_src, size); + return true; +} + +bool HostExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + memcpy(host_dst, gpu_src.opaque(), size); + return true; +} + +bool HostExecutor::SynchronousMemcpyDeviceToDevice( + DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { + memcpy(gpu_dst->opaque(), gpu_src.opaque(), size); + return true; +} + +bool HostExecutor::HostCallback(Stream *stream, + std::function callback) { + AsHostStream(stream)->EnqueueTask(callback); + return true; +} + +bool HostExecutor::AllocateStream(Stream *stream) { return true; } + +void HostExecutor::DeallocateStream(Stream *stream) {} + +bool HostExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + AsHostStream(dependent)->EnqueueTask( + [other]() { other->BlockHostUntilDone(); }); + AsHostStream(dependent)->BlockUntilDone(); + return true; +} + +bool HostExecutor::StartTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Start(stream); + return true; +} + +bool HostExecutor::StopTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Stop(stream); + return true; +} + +bool HostExecutor::BlockHostUntilDone(Stream *stream) { + AsHostStream(stream)->BlockUntilDone(); + return true; +} + +DeviceDescription *HostExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + builder.set_device_address_bits(64); + + // TODO(rspringer): How to report a value that's based in reality but that + // doesn't result in thrashing or other badness? 4GiB chosen arbitrarily. 
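+  // Illustrative arithmetic (assumed, not from the original change): 4 GiB is
+  // 4 * 1024 * 1024 * 1024 = 4,294,967,296 bytes, so the product below is
+  // computed in 64-bit (uint64) arithmetic to avoid overflowing a 32-bit int.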
+  builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
+
+  builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+bool HostExecutor::SupportsBlas() const {
+  return PluginRegistry::Instance()
+      ->GetFactory<PluginRegistry::BlasFactory>(kHostPlatformId,
+                                                plugin_config_.blas())
+      .ok();
+}
+
+blas::BlasSupport *HostExecutor::CreateBlas() {
+  PluginRegistry *registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::BlasFactory> status =
+      registry->GetFactory<PluginRegistry::BlasFactory>(kHostPlatformId,
+                                                        plugin_config_.blas());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve BLAS factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+bool HostExecutor::SupportsFft() const {
+  return PluginRegistry::Instance()
+      ->GetFactory<PluginRegistry::FftFactory>(kHostPlatformId,
+                                               plugin_config_.fft())
+      .ok();
+}
+
+fft::FftSupport *HostExecutor::CreateFft() {
+  PluginRegistry *registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::FftFactory> status =
+      registry->GetFactory<PluginRegistry::FftFactory>(kHostPlatformId,
+                                                       plugin_config_.fft());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve FFT factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+bool HostExecutor::SupportsRng() const {
+  return PluginRegistry::Instance()
+      ->GetFactory<PluginRegistry::RngFactory>(kHostPlatformId,
+                                               plugin_config_.rng())
+      .ok();
+}
+
+rng::RngSupport *HostExecutor::CreateRng() {
+  PluginRegistry *registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::RngFactory> status =
+      registry->GetFactory<PluginRegistry::RngFactory>(kHostPlatformId,
+                                                       plugin_config_.rng());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve RNG factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+}  // namespace host
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
new file mode 100644
index 00000000000000..f217f7947f998f
--- /dev/null
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -0,0 +1,215 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Declares the HostExecutor class, which is a CPU-only implementation of
+// the StreamExecutor interface. For now, this is used for testing and to
+// examine the performance of host-based StreamExecutor code.
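To make the intent of the new executor concrete, here is a minimal usage sketch (not part of this change). It assumes the existing StreamExecutor client API (`Stream`, `ThenMemcpy`, `BlockHostUntilDone`) and the `"Host"` platform name registered by `host_platform.cc` later in this patch; the function name is invented and error handling is omitted.

```c++
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace se = ::perftools::gputools;

// Hypothetical helper; not part of the change.
void HostExecutorSmokeTest() {
  // host_platform.cc (below) registers the platform under the name "Host".
  se::Platform *platform =
      se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
  se::StreamExecutor *executor = platform->ExecutorForDevice(0).ValueOrDie();

  // "Device" memory is ordinary heap memory, and the copy is enqueued on a
  // single-threaded HostStream, so it completes in FIFO order.
  se::DeviceMemory<char> device_buffer = executor->AllocateArray<char>(16);
  char host_buffer[16] = "hello, host";

  se::Stream stream(executor);
  stream.Init();
  stream.ThenMemcpy(&device_buffer, host_buffer, sizeof(host_buffer));
  stream.BlockHostUntilDone();

  executor->Deallocate(&device_buffer);
}
```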
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/host/host_stream.h" +#include "tensorflow/stream_executor/host/host_timer.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +// An implementation of StreamExecutor that does no communication or interaction +// with a device, but DOES perform memory operations backed by the host. +// Plugin routines (RNG, BLAS) are also supported and functional. +// Kernel invocations will fail, but host callbacks may be enqueued on this +// executor and its associated stream, and should follow standard ordering +// semantics. +// +// This is useful for evaluating the performance of host-based or fallback +// routines executed under the context of a GPU executor. +// See stream_executor.h for description of the below operations. +class HostExecutor : public internal::StreamExecutorInterface { + public: + explicit HostExecutor(const PluginConfig &plugin_config); + ~HostExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override { + return port::Status::OK(); + } + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override { + return false; + } + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const KernelArgsArrayBase &args) override { + return false; + } + + void *Allocate(uint64 size) override; + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + void Deallocate(DeviceMemoryBase *mem) override; + + void *HostMemoryAllocate(uint64 size) override { return new char[size]; } + void HostMemoryDeallocate(void *mem) override { + delete[] static_cast(mem); + } + bool HostMemoryRegister(void *mem, uint64 size) override { return true; } + bool HostMemoryUnregister(void *mem) override { return true; } + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &host_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + // No "synchronize all activity" implemented for this platform at the moment. 
+  bool SynchronizeAllActivity() override { return false; }
+
+  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
+
+  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+                         uint64 size) override;
+
+  bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
+                         uint64 size) override;
+  bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
+                         uint64 size) override;
+  bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
+                                       const DeviceMemoryBase &gpu_src,
+                                       uint64 size) override;
+
+  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+
+  port::Status AllocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status DeallocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status RecordEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status WaitForEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  Event::Status PollForEventStatus(Event *event) override {
+    return Event::Status::kError;
+  }
+
+  bool AllocateStream(Stream *stream) override;
+  void DeallocateStream(Stream *stream) override;
+  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
+
+  // No special initialization is necessary for host timers.
+  bool AllocateTimer(Timer *timer) override { return true; }
+
+  void DeallocateTimer(Timer *timer) override {}
+
+  bool StartTimer(Stream *stream, Timer *timer) override;
+
+  bool StopTimer(Stream *stream, Timer *timer) override;
+
+  bool BlockHostUntilDone(Stream *stream) override;
+
+  int PlatformDeviceCount() override { return 1; }
+
+  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
+    return false;
+  }
+
+  DeviceDescription *PopulateDeviceDescription() const override;
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return port::Status::OK();
+  }
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return true;
+  }
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
+    LOG(INFO) << "Shared memory configuration is unsupported for host "
+              << "executors.";
+    return SharedMemoryConfig::kDefault;
+  }
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
+    string error_msg{
+        "Shared memory configuration is unsupported for host "
+        "executors."};
+    LOG(INFO) << error_msg;
+    return port::Status{port::error::UNIMPLEMENTED, error_msg};
+  }
+
+  bool SupportsBlas() const override;
+  blas::BlasSupport *CreateBlas() override;
+
+  bool SupportsDnn() const override { return false; }
+  dnn::DnnSupport *CreateDnn() override { return nullptr; }
+
+  bool SupportsFft() const override;
+  fft::FftSupport *CreateFft() override;
+
+  bool SupportsRng() const override;
+  rng::RngSupport *CreateRng() override;
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override {
+    LOG(WARNING) << "Events not currently supported by HostExecutor.";
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override {
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
+      override {
+    return std::unique_ptr<internal::StreamInterface>(new HostStream());
+  }
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
+    return std::unique_ptr<internal::TimerInterface>(new HostTimer());
+  }
+
+  void *CudaContextHack() override { return nullptr; }
+
+ private:
+  const PluginConfig plugin_config_;
+};
+
+}  // namespace
host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc new file mode 100644 index 00000000000000..1fa4dfce8464f4 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform.cc @@ -0,0 +1,120 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_platform.h" + +#include + +#include "tensorflow/stream_executor/host/host_gpu_executor.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/status_macros.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +HostPlatform::HostPlatform() : name_("Host") {} + +HostPlatform::~HostPlatform() {} + +Platform::Id HostPlatform::id() const { return kHostPlatformId; } + +int HostPlatform::VisibleDeviceCount() const { + return std::thread::hardware_concurrency(); +} + +const string& HostPlatform::Name() const { return name_; } + +port::StatusOr HostPlatform::ExecutorForDevice(int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr HostPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr HostPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(executors_mutex_); + + port::StatusOr status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + SE_RETURN_IF_ERROR( + executor_cache_.Insert(config, executor.ConsumeValueOrDie())); + return naked_executor; +} + +port::StatusOr> +HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique( + this, new HostExecutor(config.plugin_config)); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for device ordinal %d: %s", + config.ordinal, 
init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void HostPlatform::RegisterTraceListener( + std::unique_ptr listener) { + LOG(FATAL) << "not yet implemented: register host trace listener"; +} + +void HostPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister host trace listener"; +} + +static void InitializeHostPlatform() { + std::unique_ptr platform(new gpu::host::HostPlatform); + SE_CHECK_OK(gpu::MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace host +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER( + host_platform, perftools::gputools::host::InitializeHostPlatform()); diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h new file mode 100644 index 00000000000000..86805ef3e30f4f --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform.h @@ -0,0 +1,88 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Declares the "host" platform, which is a CPU-only implementation of the +// StreamExecutor. The host platform only supports memory operations and plugin +// routines, and is primarily used for testing. +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ + +#include +#include +#include + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace host { + +// Host (CPU) platform plugin, registered as a singleton value via module +// initializer. +class HostPlatform : public Platform { + public: + HostPlatform(); + ~HostPlatform() override; + + Platform::Id id() const override; + + // Device count is less clear-cut for CPUs than accelerators. This call + // currently returns the number of thread units in the host, as reported by + // base::NumCPUs(). 
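+  // (Note: the definition added in host_platform.cc in this same change
+  // returns std::thread::hardware_concurrency().)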
+ int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr ExecutorForDevice(int ordinal) override; + + port::StatusOr ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // This platform's name. + string name_; + + // mutex that guards the ordinal-to-executor map. + mutable mutex executors_mutex_; + + // Cache of created StreamExecutors. + ExecutorCache executor_cache_; + + SE_DISALLOW_COPY_AND_ASSIGN(HostPlatform); +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ diff --git a/tensorflow/stream_executor/host/host_platform_id.cc b/tensorflow/stream_executor/host/host_platform_id.cc new file mode 100644 index 00000000000000..69a203f2985b67 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform_id.cc @@ -0,0 +1,26 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_platform_id.h" + +namespace perftools { +namespace gputools { +namespace host { + +PLATFORM_DEFINE_ID(kHostPlatformId); + +} // namespace host +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_platform_id.h b/tensorflow/stream_executor/host/host_platform_id.h new file mode 100644 index 00000000000000..61d84ea2e2faca --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform_id.h @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ + +#include "tensorflow/stream_executor/platform.h" + +namespace perftools { +namespace gputools { +namespace host { + +// Opaque and unique identifier for the host platform. +// This is needed so that plugins can refer to/identify this platform without +// instantiating a HostPlatform object. 
+// This is broken out here to avoid a circular dependency between HostPlatform +// and HostStreamExecutor. +extern const Platform::Id kHostPlatformId; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc new file mode 100644 index 00000000000000..c81ca406b8eca4 --- /dev/null +++ b/tensorflow/stream_executor/host/host_stream.cc @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class method definitions for HostStream, the Stream implementation for +// the HostExecutor implementation. +#include "tensorflow/stream_executor/host/host_stream.h" + +namespace perftools { +namespace gputools { +namespace host { + +HostStream::HostStream() + : host_executor_(new port::ThreadPool(port::Env::Default(), + port::ThreadOptions(), + "host_executor", kExecutorThreads)) {} + +HostStream::~HostStream() {} + +bool HostStream::EnqueueTask(std::function task) { + { + mutex_lock lock(mu_); + ++pending_tasks_; + } + host_executor_->Schedule([this, task]() { + task(); + { + mutex_lock lock(mu_); + --pending_tasks_; + } + completion_condition_.notify_all(); + }); + return true; +} + +void HostStream::BlockUntilDone() { + mutex_lock lock(mu_); + completion_condition_.wait(lock, [this]() { + return pending_tasks_ == 0; + }); +} + +} // namespace host + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h new file mode 100644 index 00000000000000..9894d17febcae2 --- /dev/null +++ b/tensorflow/stream_executor/host/host_stream.h @@ -0,0 +1,58 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class declaration for Stream type that enqueues tasks onto a host/CPU-based +// execution context (as opposed to a GPU device), HostExecutor. 
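As an aside, the FIFO guarantee described in the comment above can be sketched with a tiny standalone example (not part of this change; the function name is invented and error handling is omitted). It relies only on the `EnqueueTask` and `BlockUntilDone` methods declared in this header.

```c++
#include "tensorflow/stream_executor/host/host_stream.h"

// Hypothetical illustration; not part of the change.
void HostStreamOrderingSketch() {
  perftools::gputools::host::HostStream stream;
  int value = 0;
  // Both tasks run on the stream's single worker thread in enqueue order,
  // so the second task always observes the first task's write.
  stream.EnqueueTask([&value]() { value = 1; });
  stream.EnqueueTask([&value]() { value *= 10; });
  stream.BlockUntilDone();
  // value == 10 here.
}
```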
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ + +#include +#include + +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +class HostStream : public internal::StreamInterface { + public: + HostStream(); + ~HostStream() override; + + bool EnqueueTask(std::function task); + + void *CudaStreamHack() override { return nullptr; } + void **CudaStreamMemberHack() override { return nullptr; } + + void BlockUntilDone(); + + private: + // Use only one thread and own task queue to preserve FIFO ordering + // for the operations enqueued by any given stream. + static const int kExecutorThreads = 1; + std::unique_ptr host_executor_; + + mutex mu_; + int pending_tasks_ GUARDED_BY(mu_) = 0; + condition_variable completion_condition_; +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ diff --git a/tensorflow/stream_executor/host/host_timer.cc b/tensorflow/stream_executor/host/host_timer.cc new file mode 100644 index 00000000000000..187db9f0c27245 --- /dev/null +++ b/tensorflow/stream_executor/host/host_timer.cc @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_timer.h" + +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +using std::chrono::duration_cast; + +bool HostTimer::Start(Stream* stream) { + return stream->ThenDoHostCallback([this]() { this->StartNow(); }).ok(); +} + +bool HostTimer::Stop(Stream* stream) { + return stream->ThenDoHostCallback([this]() { this->StopNow(); }).ok(); +} + +uint64 HostTimer::Microseconds() const { + return duration_cast(duration_).count(); +} + +uint64 HostTimer::Nanoseconds() const { + return duration_cast(duration_).count(); +} + +void HostTimer::StartNow() { start_time_ = clock::now(); } + +void HostTimer::StopNow() { duration_ = clock::now() - start_time_; } + +} // namespace host +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_timer.h b/tensorflow/stream_executor/host/host_timer.h new file mode 100644 index 00000000000000..17af7c0521d2ee --- /dev/null +++ b/tensorflow/stream_executor/host/host_timer.h @@ -0,0 +1,63 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ + +#include + +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +class HostTimer : public internal::TimerInterface { + public: + HostTimer() {} + ~HostTimer() override {} + + // Begins the timer at the present point in the stream. + bool Start(Stream *stream); + + // Stops the timer at the present point in the stream. + bool Stop(Stream *stream); + + // Returns the most recent value recorded for a start/stopcycle, in + // microseconds. + uint64 Microseconds() const override; + + // Returns the most recent value recorded for a start/stopcycle, in + // nanoseconds. + uint64 Nanoseconds() const override; + + private: + using clock = std::chrono::high_resolution_clock; + + clock::time_point start_time_; + clock::duration duration_; + + // Actually starts (rather than enqueues starting) the timer. + void StartNow(); + + // Actually stops (rather than enqueues stopping) the timer. + void StopNow(); +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h index 3e5453e4c9c66b..4291a7a6321496 100644 --- a/tensorflow/stream_executor/kernel.h +++ b/tensorflow/stream_executor/kernel.h @@ -69,6 +69,7 @@ limitations under the License. #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_ #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_ +#include #include #include #include diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h index f11a4cb25b21d4..9bdc46b3dca3a4 100644 --- a/tensorflow/stream_executor/lib/env.h +++ b/tensorflow/stream_executor/lib/env.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_ #include "tensorflow/core/platform/env.h" +#include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/stringpiece.h" #include "tensorflow/stream_executor/platform/port.h" diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index 6f3ac8bda1868d..b1f25ab5d3d458 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -22,13 +22,11 @@ limitations under the License. 
namespace perftools { namespace gputools { -/* static */ mutex MultiPlatformManager::platforms_mutex_(LINKER_INITIALIZED); - /* static */ port::Status MultiPlatformManager::RegisterPlatform( std::unique_ptr platform) { CHECK(platform != nullptr); string key = port::Lowercase(platform->Name()); - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) { return port::Status(port::error::INTERNAL, "platform is already registered with name: \"" + @@ -46,7 +44,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithName( const string& target) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); auto it = GetPlatformMap()->find(port::Lowercase(target)); if (it == GetPlatformMap()->end()) { @@ -60,7 +58,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); auto it = GetPlatformByIdMap()->find(id); if (it == GetPlatformByIdMap()->end()) { return port::Status( @@ -72,7 +70,7 @@ namespace gputools { } /* static */ void MultiPlatformManager::ClearPlatformRegistry() { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); GetPlatformMap()->clear(); GetPlatformByIdMap()->clear(); } diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h index e49cb73848d39b..ea6155b4826439 100644 --- a/tensorflow/stream_executor/multi_platform_manager.h +++ b/tensorflow/stream_executor/multi_platform_manager.h @@ -123,13 +123,16 @@ class MultiPlatformManager { // Provides access to the available set of platforms under a lock. static port::Status WithPlatforms( std::function callback) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); return callback(GetPlatformMap()); } private: // mutex that guards the platform map. - static mutex platforms_mutex_; + static mutex& GetPlatformsMutex() { + static mutex* platforms_mutex = new mutex; + return *platforms_mutex; + } // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely. // TODO(b/22689637): Move this (whatever the final/"official" map is) to diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md index 45fa9f25cf0396..b3f66893020788 100644 --- a/tensorflow/tensorboard/README.md +++ b/tensorflow/tensorboard/README.md @@ -16,12 +16,12 @@ For in-depth information on the Graph Visualizer, see this tutorial: [TensorBoar # Usage Before running TensorBoard, make sure you have generated summary data in a log -directory by creating a `SummaryWriter`: +directory by creating a summary writer: ``` python # sess.graph_def is the graph definition; that enables the Graph Visualizer. -summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph) +file_writer = tf.summary.FileWriter('/path/to/logs', sess.graph) ``` For more details, see [this @@ -64,7 +64,7 @@ a TensorFlow graph. However, summary ops have a twist: the Tensors they produce contain serialized protobufs, which are written to disk and sent to TensorBoard. To visualize the summary data in TensorBoard, you should evaluate the summary op, retrieve the result, and then write that result to disk using a -SummaryWriter. A full explanation, with examples, is in [the +summary.FileWriter. 
A full explanation, with examples, is in [the tutorial](https://www.tensorflow.org/versions/r0.12/how_tos/summaries_and_tensorboard/index.html). ### Tags: Giving names to data @@ -77,7 +77,7 @@ a lot of tags, we recommend grouping them with slashes. ### Event Files & LogDirs: How TensorBoard loads the data -`SummaryWriters` take summary data from TensorFlow, and then write them to a +`summary.FileWriters` take summary data from TensorFlow, and then write them to a specified directory, known as the `logdir`. Specifically, the data is written to an append-only record dump that will have "tfevents" in the filename. TensorBoard reads data from a full directory, and organizes it into the history @@ -144,11 +144,11 @@ the run-selector on the left. Additionally, you can create new folders to organize tags by writing regular expressions in the box in the top-left of the dashboard. -### Histogram Dashboard +### Distribution Dashboard -The Histogram Dashboard is for visualizing how the statistical distribution of a -Tensor has varied over time. It visualizes data recorded via a -tf.histogram_summary. Right now, its name is a bit of a misnomer, as it doesn't +The Distribution Dashboard is for visualizing how the statistical distribution +of a Tensor has varied over time. It visualizes data recorded via a +tf.summary.histogram. Right now, its name is a bit of a misnomer, as it doesn't show histograms; instead, it shows some high-level statistics on a distribution. Each line on the chart represents a percentile in the distribution over the data: for example, the bottom line shows how the minimum value has changed over @@ -167,7 +167,7 @@ replacement. ### Image Dashboard -The Image Dashboard can display pngs that were saved via a tf.image_summary. The +The Image Dashboard can display pngs that were saved via a tf.summary.image. The dashboard is set up so that each row corresponds to a different tag, and each column corresponds to a run. Since the image dashboard supports arbitrary pngs, you can use this to embed custom visualizations (e.g. matplotlib scatterplots) @@ -176,7 +176,7 @@ into TensorBoard. This dashboard always shows you the latest image for each tag. ### Audio Dashboard The Audio Dashboard can embed playable audio widgets for audio saved via a -tf.audio_summary. The dashboard is set up so that each row corresponds to a +tf.summary.audio. The dashboard is set up so that each row corresponds to a different tag, and each column corresponds to a run. This dashboard always embeds the latest audio for each tag. @@ -228,7 +228,7 @@ only reads one file at a time. Let's suppose we have files with timestamps `a` and `b`, where `a

To store a graph, create a - tf.train.SummaryWriter + tf.summary.FileWriter and pass the graph either via the constructor, or by calling its add_graph() method. You may want to check out the diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider.ts b/tensorflow/tensorboard/components/vz_projector/data-provider.ts index b6c2c30dbe2f08..9c6d675fc9ce3b 100644 --- a/tensorflow/tensorboard/components/vz_projector/data-provider.ts +++ b/tensorflow/tensorboard/components/vz_projector/data-provider.ts @@ -297,6 +297,7 @@ export function fetchImage(url: string): Promise { let image = new Image(); image.onload = () => resolve(image); image.onerror = (err) => reject(err); + image.crossOrigin = ''; image.src = url; }); } diff --git a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts b/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts index d00973935cdb36..8d986c7333ffff 100644 --- a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts +++ b/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts @@ -25,11 +25,10 @@ import {ScatterPlotVisualizerTraces} from './scatterPlotVisualizerTraces'; import * as vector from './vector'; const LABEL_FONT_SIZE = 10; -const LABEL_SCALE_DEFAULT = 1.0; const LABEL_SCALE_LARGE = 2; +const LABEL_SCALE_DEFAULT = 1.0; const LABEL_FILL_COLOR_SELECTED = 0x000000; const LABEL_FILL_COLOR_HOVER = 0x000000; -const LABEL_FILL_COLOR_NEIGHBOR = 0x000000; const LABEL_STROKE_COLOR_SELECTED = 0xFFFFFF; const LABEL_STROKE_COLOR_HOVER = 0xFFFFFF; const LABEL_STROKE_COLOR_NEIGHBOR = 0xFFFFFF; @@ -82,7 +81,7 @@ export class ProjectorScatterPlotAdapter { private selectedPointIndices: number[]; private neighborsOfFirstSelectedPoint: NearestEntry[]; private renderLabelsIn3D: boolean = false; - private labelPointAccessor: (ds: DataSet, index: number) => string; + private labelPointAccessor: string; private legendPointColorer: (ds: DataSet, index: number) => string; private distanceMetric: DistanceFunction; @@ -137,11 +136,9 @@ export class ProjectorScatterPlotAdapter { if (this.traceVisualizer != null) { this.traceVisualizer.setDataSet(dataSet); } - if (this.canvasLabelsVisualizer != null) { - this.canvasLabelsVisualizer.setDataSet(dataSet); - } if (this.labels3DVisualizer != null) { - this.labels3DVisualizer.setDataSet(dataSet); + this.labels3DVisualizer.setLabelStrings( + this.generate3DLabelsArray(dataSet, this.labelPointAccessor)); } if (this.spriteVisualizer == null) { return; @@ -176,12 +173,12 @@ export class ProjectorScatterPlotAdapter { this.legendPointColorer = legendPointColorer; } - setLabelPointAccessor( - labelPointAccessor: (ds: DataSet, index: number) => string) { + setLabelPointAccessor(labelPointAccessor: string) { this.labelPointAccessor = labelPointAccessor; if (this.labels3DVisualizer != null) { - this.labels3DVisualizer.setLabelStrings(this.generate3DLabelsArray( - this.projection.dataSet, labelPointAccessor)); + const ds = (this.projection == null) ? 
null : this.projection.dataSet; + this.labels3DVisualizer.setLabelStrings( + this.generate3DLabelsArray(ds, labelPointAccessor)); } } @@ -223,7 +220,7 @@ export class ProjectorScatterPlotAdapter { const pointScaleFactors = this.generatePointScaleFactorArray( dataSet, selectedSet, neighbors, hoverIndex); const labels = this.generateVisibleLabelRenderParams( - dataSet, selectedSet, neighbors, hoverIndex); + dataSet, selectedSet, neighbors, hoverIndex, this.distanceMetric); const traceColors = this.generateLineSegmentColorMap(dataSet, pointColorer); const traceOpacities = this.generateLineSegmentOpacityArray(dataSet, selectedSet); @@ -300,8 +297,8 @@ export class ProjectorScatterPlotAdapter { generateVisibleLabelRenderParams( ds: DataSet, selectedPointIndices: number[], - neighborsOfFirstPoint: NearestEntry[], - hoverPointIndex: number): LabelRenderParams { + neighborsOfFirstPoint: NearestEntry[], hoverPointIndex: number, + distFunc: DistanceFunction): LabelRenderParams { if (ds == null) { return null; } @@ -326,7 +323,8 @@ export class ProjectorScatterPlotAdapter { let dst = 0; if (hoverPointIndex != null) { - labelStrings.push(this.labelPointAccessor(ds, hoverPointIndex)); + labelStrings.push( + this.getLabelText(ds, hoverPointIndex, this.labelPointAccessor)); visibleLabels[dst] = hoverPointIndex; scale[dst] = LABEL_SCALE_LARGE; opacityFlags[dst] = 0; @@ -346,7 +344,8 @@ export class ProjectorScatterPlotAdapter { const strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_SELECTED); for (let i = 0; i < n; ++i) { const labelIndex = selectedPointIndices[i]; - labelStrings.push(this.labelPointAccessor(ds, labelIndex)); + labelStrings.push( + this.getLabelText(ds, labelIndex, this.labelPointAccessor)); visibleLabels[dst] = labelIndex; scale[dst] = LABEL_SCALE_LARGE; opacityFlags[dst] = (n === 1) ? 0 : 1; @@ -361,12 +360,15 @@ export class ProjectorScatterPlotAdapter { // Neighbors { const n = neighborCount; - const fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_NEIGHBOR); + const minDist = n > 0 ? 
neighborsOfFirstPoint[0].dist : 0; const strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_NEIGHBOR); for (let i = 0; i < n; ++i) { const labelIndex = neighborsOfFirstPoint[i].index; - labelStrings.push(this.labelPointAccessor(ds, labelIndex)); + labelStrings.push( + this.getLabelText(ds, labelIndex, this.labelPointAccessor)); visibleLabels[dst] = labelIndex; + const fillRgb = styleRgbFromDistance( + distFunc, neighborsOfFirstPoint[i].dist, minDist); packRgbIntoUint8Array( fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]); packRgbIntoUint8Array( @@ -600,35 +602,39 @@ export class ProjectorScatterPlotAdapter { return colors; } - generate3DLabelsArray( - ds: DataSet, accessor: (ds: DataSet, i: number) => string) { + generate3DLabelsArray(ds: DataSet, accessor: string) { if ((ds == null) || (accessor == null)) { return null; } let labels: string[] = []; const n = ds.points.length; for (let i = 0; i < n; ++i) { - labels.push(accessor(ds, i).toString()); + labels.push(this.getLabelText(ds, i, accessor)); } return labels; } + private getLabelText(ds: DataSet, i: number, accessor: string) { + return ds.points[i].metadata[accessor].toString(); + } + private updateScatterPlotWithNewProjection(projection: Projection) { - if (projection != null) { - this.scatterPlot.setDimensions(projection.dimensionality); - if (projection.dataSet.projectionCanBeRendered( - projection.projectionType)) { - this.updateScatterPlotAttributes(); - this.notifyProjectionPositionsUpdated(); - } - this.scatterPlot.setCameraParametersForNextCameraCreation(null, false); - } else { + if (projection == null) { + this.createVisualizers(this.renderLabelsIn3D); + this.scatterPlot.render(); + return; + } + this.setDataSet(projection.dataSet); + this.scatterPlot.setDimensions(projection.dimensionality); + if (projection.dataSet.projectionCanBeRendered(projection.projectionType)) { this.updateScatterPlotAttributes(); this.notifyProjectionPositionsUpdated(); } + this.scatterPlot.setCameraParametersForNextCameraCreation(null, false); } private createVisualizers(inLabels3DMode: boolean) { + const ds = (this.projection == null) ? null : this.projection.dataSet; const scatterPlot = this.scatterPlot; scatterPlot.removeAllVisualizers(); this.labels3DVisualizer = null; @@ -637,8 +643,8 @@ export class ProjectorScatterPlotAdapter { this.traceVisualizer = null; if (inLabels3DMode) { this.labels3DVisualizer = new ScatterPlotVisualizer3DLabels(); - this.labels3DVisualizer.setLabelStrings(this.generate3DLabelsArray( - this.projection.dataSet, this.labelPointAccessor)); + this.labels3DVisualizer.setLabelStrings( + this.generate3DLabelsArray(ds, this.labelPointAccessor)); } else { this.spriteVisualizer = new ScatterPlotVisualizerSprites(); scatterPlot.addVisualizer(this.spriteVisualizer); @@ -646,8 +652,7 @@ export class ProjectorScatterPlotAdapter { new ScatterPlotVisualizerCanvasLabels(this.scatterPlotContainer); } this.traceVisualizer = new ScatterPlotVisualizerTraces(); - const dataSet = (this.projection == null) ? 
null : this.projection.dataSet; - this.setDataSet(dataSet); + this.setDataSet(ds); if (this.spriteVisualizer) { scatterPlot.addVisualizer(this.spriteVisualizer); } @@ -684,6 +689,13 @@ function styleRgbFromHexColor(hex: number): [number, number, number] { return [(c.r * 255) | 0, (c.g * 255) | 0, (c.b * 255) | 0]; } +function styleRgbFromDistance( + distFunc: DistanceFunction, d: number, + minDist: number): [number, number, number] { + const c = new THREE.Color(dist2color(distFunc, d, minDist)); + return [(c.r * 255) | 0, (c.g * 255) | 0, (c.b * 255) | 0]; +} + function getDefaultPointInTraceColor( index: number, totalPoints: number): THREE.Color { let hue = diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts index 9d9b0b5aff78ca..625930395dd6df 100644 --- a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts +++ b/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts @@ -120,8 +120,8 @@ export class ScatterPlot { this.getLayoutValues(); this.scene = new THREE.Scene(); - this.renderer = - new THREE.WebGLRenderer({alpha: true, premultipliedAlpha: false}); + this.renderer = new THREE.WebGLRenderer( + {alpha: true, premultipliedAlpha: false, antialias: true}); this.renderer.setClearColor(BACKGROUND_COLOR, 1); this.containerNode.appendChild(this.renderer.domElement); this.light = new THREE.PointLight(0xFFECBF, 1, 0); diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts index ecd2e21403ab31..cbd9785e2f6eb5 100644 --- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts +++ b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -import {DataSet} from './data'; import {RenderContext} from './renderContext'; import {ScatterPlotVisualizer} from './scatterPlotVisualizer'; import * as util from './util'; @@ -96,7 +95,6 @@ type GlyphTexture = { * Renders the text labels as 3d geometry in the world. 
*/ export class ScatterPlotVisualizer3DLabels implements ScatterPlotVisualizer { - private dataSet: DataSet; private scene: THREE.Scene; private labelStrings: string[]; private geometry: THREE.BufferGeometry; @@ -111,10 +109,6 @@ export class ScatterPlotVisualizer3DLabels implements ScatterPlotVisualizer { private labelVertexMap: number[][]; private glyphTexture: GlyphTexture; - setDataSet(ds: DataSet) { - this.dataSet = ds; - } - private createGlyphTexture(): GlyphTexture { let canvas = document.createElement('canvas'); canvas.width = MAX_CANVAS_DIMENSION; @@ -288,14 +282,14 @@ export class ScatterPlotVisualizer3DLabels implements ScatterPlotVisualizer { private colorLabels(pointColors: Float32Array) { if (this.labelStrings == null || this.geometry == null || - this.dataSet == null || pointColors == null) { + pointColors == null) { return; } const colors = this.geometry.getAttribute('color') as THREE.BufferAttribute; colors.array = this.renderColors; - const n = this.dataSet.points.length; + const n = pointColors.length / XYZ_ELEMENTS_PER_ENTRY; let src = 0; for (let i = 0; i < n; ++i) { const c = new THREE.Color( diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts index ef473eda6cb1ad..d357100797db94 100644 --- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts +++ b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -import {DataSet} from './data'; import {BoundingBox, CollisionGrid} from './label'; import {CameraType, RenderContext} from './renderContext'; import {ScatterPlotVisualizer} from './scatterPlotVisualizer'; @@ -22,6 +21,8 @@ import * as util from './util'; const MAX_LABELS_ON_SCREEN = 10000; const LABEL_STROKE_WIDTH = 3; const LABEL_FILL_WIDTH = 6; +const LABEL_BACKGROUND_CARD_COLOR = 0xFFFFFF; +const LABEL_BACKGROUND_CARD_OPACITY = 0.4; /** * Creates and maintains a 2d canvas on top of the GL canvas. All labels, when @@ -29,7 +30,6 @@ const LABEL_FILL_WIDTH = 6; */ export class ScatterPlotVisualizerCanvasLabels implements ScatterPlotVisualizer { - private dataSet: DataSet; private worldSpacePointPositions: Float32Array; private gc: CanvasRenderingContext2D; private canvas: HTMLCanvasElement; @@ -42,10 +42,6 @@ export class ScatterPlotVisualizerCanvasLabels implements this.canvas.style.pointerEvents = 'none'; } - setDataSet(ds: DataSet) { - this.dataSet = ds; - } - private removeAllLabels() { const pixelWidth = this.canvas.width * window.devicePixelRatio; const pixelHeight = this.canvas.height * window.devicePixelRatio; @@ -54,9 +50,6 @@ export class ScatterPlotVisualizerCanvasLabels implements /** Render all of the non-overlapping visible labels to the canvas. */ private makeLabels(rc: RenderContext) { - if (this.dataSet == null) { - return; - } if ((rc.labels == null) || (rc.labels.pointIndices.length === 0)) { return; } @@ -95,6 +88,9 @@ export class ScatterPlotVisualizerCanvasLabels implements // Shift the label to the right of the point circle. 
const xShift = 4; + const labelBackgroundCardStyle = this.styleStringFromHexColorAndOpacity( + LABEL_BACKGROUND_CARD_COLOR, LABEL_BACKGROUND_CARD_OPACITY); + const n = Math.min(MAX_LABELS_ON_SCREEN, lrc.pointIndices.length); for (let i = 0; i < n; ++i) { let point: THREE.Vector3; @@ -126,7 +122,7 @@ export class ScatterPlotVisualizerCanvasLabels implements if (grid.insert(textBoundingBox, true)) { const text = lrc.labelStrings[i]; const fontSize = lrc.defaultFontSize * lrc.scaleFactors[i] * dpr; - this.gc.font = fontSize + 'px roboto'; + this.gc.font = fontSize + 'pt roboto'; // Now, check with properly computed width. textBoundingBox.hiX += this.gc.measureText(text).width - 1; @@ -135,6 +131,10 @@ export class ScatterPlotVisualizerCanvasLabels implements if (sceneIs3D && (lrc.useSceneOpacityFlags[i] === 1)) { opacity = opacityMap(camToPoint.length()); } + this.gc.fillStyle = labelBackgroundCardStyle; + const rw = textBoundingBox.hiX - textBoundingBox.loX; + const rh = textBoundingBox.hiY - textBoundingBox.loY; + this.gc.fillRect(textBoundingBox.loX, textBoundingBox.loY, rw, rh); this.gc.fillStyle = this.styleStringFromPackedRgba(lrc.fillColors, i, opacity); this.gc.strokeStyle = @@ -148,6 +148,15 @@ export class ScatterPlotVisualizerCanvasLabels implements } } + private styleStringFromHexColorAndOpacity(hex: number, opacity: number): + string { + const c = new THREE.Color(hex); + const r = (c.r * 255) | 0; + const g = (c.g * 255) | 0; + const b = (c.b * 255) | 0; + return 'rgba(' + r + ',' + g + ',' + b + ',' + opacity + ')'; + } + private styleStringFromPackedRgba( packedRgbaArray: Uint8Array, colorIndex: number, opacity: number): string { diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts index 1facddba1ab4cf..90b6d0273eda39 100644 --- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts +++ b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts @@ -303,7 +303,6 @@ export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer { dispose() { this.disposeGeometry(); this.disposeTextureAtlas(); - this.worldSpacePointPositions = null; } private disposeGeometry() { @@ -311,6 +310,7 @@ export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer { this.scene.remove(this.points); this.points.geometry.dispose(); this.points = null; + this.worldSpacePointPositions = null; } } @@ -345,14 +345,11 @@ export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer { onPointPositionsChanged(newPositions: Float32Array) { if ((newPositions == null) || (newPositions.length === 0)) { - this.disposeGeometry(); + this.dispose(); return; } - if (this.points != null) { - const notEnoughSpace = - (this.worldSpacePointPositions.length < newPositions.length); - if (notEnoughSpace) { + if (this.worldSpacePointPositions.length !== newPositions.length) { this.disposeGeometry(); } } diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts index 78accd80d0e4a9..9cc11c0f323ff3 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts @@ -87,8 +87,8 @@ export class DataPanel extends DataPanelPolymer { return isSeparator ? 
'separator' : null; } - metadataChanged(spriteAndMetadata: SpriteAndMetadataInfo, - metadataFile: string) { + metadataChanged( + spriteAndMetadata: SpriteAndMetadataInfo, metadataFile: string) { this.updateMetadataUI(spriteAndMetadata.stats, metadataFile); } @@ -171,18 +171,19 @@ export class DataPanel extends DataPanelPolymer { } _selectedTensorChanged() { + this.projector.updateDataSet(null, null, null); if (this.selectedTensor == null) { return; } this.dataProvider.retrieveTensor( this.selectedRun, this.selectedTensor, ds => { - let metadataFile = - this.getEmbeddingInfoByName(this.selectedTensor).metadataPath; - this.dataProvider.retrieveSpriteAndMetadata(this.selectedRun, - this.selectedTensor, metadata => { - this.projector.updateDataSet(ds, metadata, metadataFile); - }); - }); + let metadataFile = + this.getEmbeddingInfoByName(this.selectedTensor).metadataPath; + this.dataProvider.retrieveSpriteAndMetadata( + this.selectedRun, this.selectedTensor, metadata => { + this.projector.updateDataSet(ds, metadata, metadataFile); + }); + }); this.projector.setSelectedTensor( this.selectedRun, this.getEmbeddingInfoByName(this.selectedTensor)); } @@ -223,10 +224,7 @@ export class DataPanel extends DataPanelPolymer { return a <= b ? -1 : 1; }); this.tensorNames = names.map(name => { - return { - name, - shape: this.getEmbeddingInfoByName(name).tensorShape - }; + return {name, shape: this.getEmbeddingInfoByName(name).tensorShape}; }); let wordBreakablePath = this.addWordBreaks(this.projectorConfig.modelCheckpointPath); @@ -324,8 +322,9 @@ export class DataPanel extends DataPanelPolymer { }); let uploadButton = this.dom.select('#upload-tensors'); - uploadButton.on( - 'click', () => { (fileInput.node() as HTMLInputElement).click(); }); + uploadButton.on('click', () => { + (fileInput.node() as HTMLInputElement).click(); + }); // Show and setup the upload metadata button. 
let fileMetadataInput = this.dom.select('#file-metadata'); diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts index 32e9b0a724b7f0..9df182ed489afd 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts @@ -357,12 +357,16 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { return; } if (projection === 'pca') { - this.dataSet.stopTSNE(); + if (this.dataSet != null) { + this.dataSet.stopTSNE(); + } this.showPCA(); } else if (projection === 'tsne') { this.showTSNE(); } else if (projection === 'custom') { - this.dataSet.stopTSNE(); + if (this.dataSet != null) { + this.dataSet.stopTSNE(); + } this.computeAllCentroids(); this.reprojectCustom(); } diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector.ts index 14ea58b24af830..f184ce98603948 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector.ts @@ -127,10 +127,9 @@ export class Projector extends ProjectorPolymer implements setSelectedLabelOption(labelOption: string) { this.selectedLabelOption = labelOption; - const labelAccessor = (ds: DataSet, i: number): string => - ds.points[i].metadata[this.selectedLabelOption] as string; this.metadataCard.setLabelOption(this.selectedLabelOption); - this.projectorScatterPlotAdapter.setLabelPointAccessor(labelAccessor); + this.projectorScatterPlotAdapter.setLabelPointAccessor(labelOption); + this.projectorScatterPlotAdapter.updateScatterPlotAttributes(); this.projectorScatterPlotAdapter.render(); } @@ -165,16 +164,20 @@ export class Projector extends ProjectorPolymer implements } if (this.projectorScatterPlotAdapter != null) { if (ds == null) { + this.projectorScatterPlotAdapter.setLabelPointAccessor(null); this.setProjection(null); + } else { + this.projectorScatterPlotAdapter.updateScatterPlotPositions(); + this.projectorScatterPlotAdapter.updateScatterPlotAttributes(); + this.projectorScatterPlotAdapter.resize(); + this.projectorScatterPlotAdapter.render(); } - this.projectorScatterPlotAdapter.updateScatterPlotPositions(); - this.projectorScatterPlotAdapter.updateScatterPlotAttributes(); - this.projectorScatterPlotAdapter.resize(); - this.projectorScatterPlotAdapter.render(); } if (ds != null) { this.dataPanel.setNormalizeData(this.normalizeData); this.setCurrentDataSet(ds.getSubset()); + this.projectorScatterPlotAdapter.setLabelPointAccessor( + this.selectedLabelOption); this.inspectorPanel.datasetChanged(); this.inspectorPanel.metadataChanged(spriteAndMetadata); @@ -416,11 +419,10 @@ export class Projector extends ProjectorPolymer implements }); { - const labelAccessor = i => - '' + this.dataSet.points[i].metadata[this.selectedLabelOption]; this.projectorScatterPlotAdapter = new ProjectorScatterPlotAdapter( this.getScatterContainer(), this as ProjectorEventContext); - this.projectorScatterPlotAdapter.setLabelPointAccessor(labelAccessor); + this.projectorScatterPlotAdapter.setLabelPointAccessor( + this.selectedLabelOption); } this.projectorScatterPlotAdapter.scatterPlot.onCameraMove( diff --git a/tensorflow/tensorboard/plugins/projector/plugin.py b/tensorflow/tensorboard/plugins/projector/plugin.py index 7c10bab6206d8c..bfcbb4ad6a283f 100644 --- 
+++ b/tensorflow/tensorboard/plugins/projector/plugin.py
@@ -136,7 +136,7 @@ def get_plugin_handlers(self, run_paths, logdir):
   @property
   def configs(self):
     """Returns a map of run paths to `ProjectorConfig` protos."""
-    run_path_pairs = self.run_paths.items()
+    run_path_pairs = list(self.run_paths.items())
     # If there are no summary event files, the projector should still work,
     # treating the `logdir` as the model checkpoint directory.
     if not run_path_pairs:
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 502d6984687536..56a0b772acfb88 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -50,54 +50,17 @@ load(
 )
 
 # List of proto files for android builds
-def tf_android_core_proto_sources():
+def tf_android_core_proto_sources(core_proto_sources_relative):
   return ["//tensorflow/core:" + p
-          for p in tf_android_core_proto_sources_relative()]
-
-# As tf_android_core_proto_sources, but paths relative to
-# //third_party/tensorflow/core.
-def tf_android_core_proto_sources_relative():
-  return [
-    "example/example.proto",
-    "example/feature.proto",
-    "framework/allocation_description.proto",
-    "framework/attr_value.proto",
-    "framework/cost_graph.proto",
-    "framework/device_attributes.proto",
-    "framework/function.proto",
-    "framework/graph.proto",
-    "framework/kernel_def.proto",
-    "framework/log_memory.proto",
-    "framework/node_def.proto",
-    "framework/op_def.proto",
-    "framework/resource_handle.proto",
-    "framework/step_stats.proto",
-    "framework/summary.proto",
-    "framework/tensor.proto",
-    "framework/tensor_description.proto",
-    "framework/tensor_shape.proto",
-    "framework/tensor_slice.proto",
-    "framework/types.proto",
-    "framework/versions.proto",
-    "lib/core/error_codes.proto",
-    "protobuf/config.proto",
-    "protobuf/tensor_bundle.proto",
-    "protobuf/saver.proto",
-    "util/memmapped_file_system.proto",
-    "util/saved_tensor_slice.proto",
-  ]
+          for p in core_proto_sources_relative]
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
-def tf_android_core_proto_headers():
+def tf_android_core_proto_headers(core_proto_sources_relative):
   return (["//tensorflow/core/" + p.replace(".proto", ".pb.h")
-           for p in tf_android_core_proto_sources_relative()] +
+           for p in core_proto_sources_relative] +
           ["//tensorflow/core/" + p.replace(".proto", ".proto.h")
-           for p in tf_android_core_proto_sources_relative()])
-
-# Returns the list of protos for which proto_text headers should be generated.
-def tf_proto_text_protos_relative():
-  return [p for p in tf_android_core_proto_sources_relative()]
+           for p in core_proto_sources_relative])
 
 def if_android_arm(a):
   return select({
diff --git a/tensorflow/tools/ci_build/builds/test_installation.sh b/tensorflow/tools/ci_build/builds/test_installation.sh
index 9cafd518cbdfe9..e0a9348fbe832b 100755
--- a/tensorflow/tools/ci_build/builds/test_installation.sh
+++ b/tensorflow/tools/ci_build/builds/test_installation.sh
@@ -108,6 +108,7 @@ PY_TEST_BLACKLIST="${PY_TEST_BLACKLIST}:"\
 PY_TEST_GPU_BLACKLIST="${PY_TEST_GPU_BLACKLIST}:"\
 "tensorflow/python/client/session_test.py:"\
 "tensorflow/python/framework/function_test.py:"\
+"tensorflow/contrib/integrate/python/ops/odes_test.py:"\
 "tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py"
 
 # Tests that should be run in the exclusive mode (i.e., not parallel with
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh
index 40fec8355d05ff..759c7e5f7e434d 100755
--- a/tensorflow/tools/ci_build/update_version.sh
+++ b/tensorflow/tools/ci_build/update_version.sh
@@ -61,6 +61,7 @@ fi
 MAJOR=$(echo "${NEW_VER}" | cut -d \. -f 1)
 MINOR=$(echo "${NEW_VER}" | cut -d \. -f 2)
 PATCH=$(echo "${NEW_VER}" | cut -d \. -f 3)
+PIP_PATCH="${PATCH//-}"
 
 # Update tensorflow/core/public/version.h
 VERSION_H="${TF_SRC_DIR}/core/public/version.h"
@@ -70,7 +71,7 @@ OLD_MAJOR=$(cat ${VERSION_H} | grep -E "^#define TF_MAJOR_VERSION [0-9]+" | \
 cut -d ' ' -f 3)
 OLD_MINOR=$(cat ${VERSION_H} | grep -E "^#define TF_MINOR_VERSION [0-9]+" | \
 cut -d ' ' -f 3)
-OLD_PATCH=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]]+" | \
-cut -d ' ' -f 3)
+OLD_PATCH=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]-]+" | \
+cut -d ' ' -f 3)
 
 sed -i -e "s/^#define TF_MAJOR_VERSION ${OLD_MAJOR}/#define TF_MAJOR_VERSION ${MAJOR}/g" ${VERSION_H}
@@ -90,57 +91,24 @@ check_existence file "${CMAKE_SETUP_PY}"
 sed -i -e "s/^\_VERSION = [\'\"].*-cmake-experimental[\'\"]/\_VERSION = \'${MAJOR}.${MINOR}.${PATCH}-cmake-experimental\'/g" "${CMAKE_SETUP_PY}"
 
-# Update Dockerfiles in tensorflow/tools/docker/
-TOOLS_DOCKER_DIR="${TF_SRC_DIR}/tools/docker"
-check_existence dir "${TOOLS_DOCKER_DIR}"
-
-# Determine the files that need to be modified
-DOCKERFILES=$(grep -lrE "^ENV TENSORFLOW_VERSION .+" ${TOOLS_DOCKER_DIR})
-for DOCKERF in ${DOCKERFILES}; do
-  sed -i -r -e "s/^ENV TENSORFLOW_VERSION .+/ENV TENSORFLOW_VERSION ${MAJOR}.${MINOR}.${PATCH}/g" "${DOCKERF}"
-done
-
 # Update os_setup.md
 OS_SETUP="${TF_SRC_DIR}/g3doc/get_started/os_setup.md"
 check_existence file "${OS_SETUP}"
 
-sed -i -r -e "s/(.*pip[0-9]* install .*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${OS_SETUP}"
-sed -i -r -e "s/(.*export TF_BINARY_URL.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${OS_SETUP}"
-sed -i -r -e "s/(.*\`)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-gpu.*)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${OS_SETUP}"
+sed -i -r -e "s/(.*pip[0-9]* install .*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}"
+sed -i -r -e "s/(.*pip[0-9]* install .*tensorflow_gpu-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}"
+sed -i -r -e "s/(.*export TF_BINARY_URL.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}"
+sed -i -r -e "s/(.*export TF_BINARY_URL.*tensorflow_gpu-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}"
+sed -i -r -e "s/(.*\`)([0-9]+\.[0-9]+\.[[:alnum:]-]+)(-gpu.*)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${OS_SETUP}"
 
 # Update README.md
 README_MD="./README.md"
 check_existence file "${README_MD}"
 
-sed -i -r -e "s/${OLD_MAJOR}\.${OLD_MINOR}\.${OLD_PATCH}/${MAJOR}.${MINOR}.${PATCH}/g" "${README_MD}"
-
-# Update tensorflow/tools/dist_test/Dockerfile
-DIST_TEST_DOCKER_FILE="${TF_SRC_DIR}/tools/dist_test/Dockerfile"
-check_existence file "${DIST_TEST_DOCKER_FILE}"
-
-sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${DIST_TEST_DOCKER_FILE}"
-
-# Update tensorflow/tools/dist_test/Dockerfile.local
-DIST_TEST_LOCAL_DOCKER_FILE="${TF_SRC_DIR}/tools/dist_test/Dockerfile.local"
-check_existence file "${DIST_TEST_LOCAL_DOCKER_FILE}"
-
-sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${DIST_TEST_LOCAL_DOCKER_FILE}"
-
-# Update tensorflow/tools/dist_test/server/Dockerfile
-SERVER_DOCKER_FILE="${TF_SRC_DIR}/tools/dist_test/server/Dockerfile"
-
-check_existence file "${SERVER_DOCKER_FILE}"
+sed -i -r -e "s/${OLD_MAJOR}\.${OLD_MINOR}\.([[:alnum:]]+)-/${MAJOR}.${MINOR}.${PIP_PATCH}-/g" "${README_MD}"
 
-sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${SERVER_DOCKER_FILE}"
-
-# Update tensorflow/tools/dist_test/server/Dockerfile.test
-TEST_SERVER_DOCKER_FILE="${TF_SRC_DIR}/tools/dist_test/server/Dockerfile.test"
-
-check_existence file "${TEST_SERVER_DOCKER_FILE}"
-
-sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${TEST_SERVER_DOCKER_FILE}"
 
 # Updates to be made if there are major / minor version changes
 MAJOR_MINOR_CHANGE=0
@@ -176,20 +144,24 @@ echo "Patch: ${OLD_PATCH} -> ${PATCH}"
 echo ""
 
 # Look for potentially lingering old version strings in TensorFlow source files
-OLD_VER="${OLD_MAJOR}\.${OLD_MINOR}\.${OLD_PATCH}"
-LINGER_STRS=$(grep -rnoH "${OLD_VER}" "${TF_SRC_DIR}")
-
-if [[ ! -z "${LINGER_STRS}" ]]; then
-  echo "WARNING: Below are potentially instances of lingering old version "\
-"string (${OLD_VER}) in source directory \"${TF_SRC_DIR}/\" that are not "\
-"updated by this script. Please check them manually!"
-  for LINGER_STR in ${LINGER_STRS}; do
-    echo "${LINGER_STR}"
-  done
-else
-  echo "No lingering old version strings found in source directory "\
-"\"${TF_SRC_DIR}/\". Good."
-fi
+declare -a OLD_PATCHES=(${OLD_PATCH} $(echo "${OLD_PATCH//-}"))
+for i in "${OLD_PATCHES[@]}"
+do
+  OLD_VER="${OLD_MAJOR}\.${OLD_MINOR}\.$i"
+  LINGER_STRS=$(grep -rnoH "${OLD_VER}" "${TF_SRC_DIR}")
+
+  if [[ ! -z "${LINGER_STRS}" ]]; then
+    echo "WARNING: Below are potentially instances of lingering old version "\
+    "string (${OLD_VER}) in source directory \"${TF_SRC_DIR}/\" that are not "\
+    "updated by this script. Please check them manually!"
+    for LINGER_STR in ${LINGER_STRS}; do
+      echo "${LINGER_STR}"
+    done
+  else
+    echo "No lingering old version strings \"${OLD_VER}\" found in source directory "\
+    "\"${TF_SRC_DIR}/\". Good."
+  fi
+done
 
 if [[ ${MAJOR_MINOR_CHANGE} == "1" ]]; then
   LINGER_R_STRS=$(grep -rnoH "${OLD_R_MAJOR_MINOR}" "${TF_SRC_DIR}")
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 14e4d03406b1ff..a9852586e933c8 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -34,8 +34,6 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-ENV TENSORFLOW_VERSION 0.12.0-rc0
-
 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #
 # COPY _PIP_FILE_ /
@@ -44,9 +42,11 @@ ENV TENSORFLOW_VERSION 0.12.0-rc0
 
 # Install TensorFlow CPU version from central repo
 RUN pip --no-cache-dir install \
-    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl
+    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 497ee7daa90c16..fdc36aef3fc814 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -89,10 +89,12 @@ WORKDIR /tensorflow
 # more difficult to experiment with local changes. Instead, just add
 # the built directory to the path.
 
+ENV CI_BUILD_PYTHON python
+
 RUN tensorflow/tools/ci_build/builds/configured CPU \
     bazel build -c opt tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
-    pip install --upgrade /tmp/pip/tensorflow-*.whl && \
+    pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
     rm -rf /root/.cache
 # Clean up pip wheel and Bazel cache when done.
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 36b2d25ed2c5d7..ded0539f893993 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -11,7 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python \
         python-dev \
         rsync \
         software-properties-common \
@@ -87,6 +86,7 @@ RUN git clone https://github.com/tensorflow/tensorflow.git && \
 WORKDIR /tensorflow
 
 # Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2
@@ -94,7 +94,7 @@ ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2
 
 RUN tensorflow/tools/ci_build/builds/configured GPU \
     bazel build -c opt --config=cuda tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
-    pip install --upgrade /tmp/pip/tensorflow-*.whl && \
+    pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
     rm -rf /root/.cache
 # Clean up pip wheel and Bazel cache when done.
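
Note: the non-devel images in this change (tensorflow/tools/docker/Dockerfile above and Dockerfile.gpu below) keep a placeholder tensorflow-0.0.0 wheel URL between a pair of "DO NOT EDIT OR DELETE BETWEEN THE LINES" marker comments, and parameterized_docker_build.sh (later in this diff) swaps out that whole marked block with a sed range-change ("/BEGIN/,/END/c<replacement>"). The snippet below is only a minimal Python sketch of that marker-to-marker rewrite, added here for illustration; the file name, helper name, and example wheel URL are hypothetical and are not part of the patch.

# replace_marked_block.py -- hypothetical sketch of the marker-range rewrite.
# Every line from the BEGIN marker through the END marker (markers included)
# is replaced by the supplied install command(s), mirroring sed's
# "/BEGIN/,/END/c" behaviour.
BEGIN = "# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #"
END = "# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #"

def replace_marked_block(dockerfile_text, replacement_lines):
    lines = dockerfile_text.splitlines()
    start = next(i for i, line in enumerate(lines) if line.strip() == BEGIN)
    end = next(i for i, line in enumerate(lines) if line.strip() == END)
    return "\n".join(lines[:start] + replacement_lines + lines[end + 1:]) + "\n"

if __name__ == "__main__":
    with open("Dockerfile") as f:
        original = f.read()
    # Point the image at an externally hosted wheel instead of the 0.0.0 placeholder.
    rewritten = replace_marked_block(
        original,
        ["RUN pip --no-cache-dir install "
         "https://example.com/tensorflow-0.0.0-cp27-none-linux_x86_64.whl"])
    with open("Dockerfile", "w") as f:
        f.write(rewritten)
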
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 9d325967c9d79c..ca3252e1d934a7 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -34,8 +34,6 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-ENV TENSORFLOW_VERSION 0.12.0-rc0
-
 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #
 # COPY _PIP_FILE_ /
@@ -44,9 +42,11 @@ ENV TENSORFLOW_VERSION 0.12.0-rc0
 
 # Install TensorFlow GPU version.
 RUN pip --no-cache-dir install \
-    http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl
+    http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 8d6099547c08b2..b0c56c89653251 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -31,9 +31,8 @@
 #
 #   TF_DOCKER_BUILD_CENTRAL_PIP
 #     (Optional)
-#     If set to any non-0 and non-empty value, will attempt to use the PIP file
-#     located on the central repo, instead of locally built pip files.
-#     This option takes effect only for non-devel builds.
+#     If set to a non-empty string, will use it as the URL from which the
+#     pip wheel file will be downloaded (instead of building the pip locally).
 #
 #   TF_DOCKER_BUILD_IMAGE_NAME:
 #     (Optional)
@@ -81,7 +80,6 @@ mark_check_failed() {
 TF_DOCKER_BUILD_TYPE=$(to_lower ${TF_DOCKER_BUILD_TYPE})
 TF_DOCKER_BUILD_IS_DEVEL=$(to_lower ${TF_DOCKER_BUILD_IS_DEVEL})
-TF_DOCKER_BUILD_CENTRAL_PIP=$(to_lower ${TF_DOCKER_BUILD_CENTRAL_PIP})
 TF_DOCKER_BUILD_PYTHON_VERSION=$(to_lower ${TF_DOCKER_BUILD_PYTHON_VERSION:-PYTHON2})
 TF_DOCKER_BUILD_OPTIONS=$(to_lower ${TF_DOCKER_BUILD_OPTIONS:-OPT})
 
@@ -144,6 +142,15 @@ else
     "${TF_DOCKER_BUILD_TYPE}"
 fi
 
+if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python2" ]]; then
+  :
+elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
+  FINAL_TAG="${FINAL_TAG}-py3"
+else
+  die "Unrecognized value in TF_DOCKER_BUILD_PYTHON_VERSION: "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION}"
+fi
+
 # Verify that the original Dockerfile exists
 ORIG_DOCKERFILE="${SCRIPT_DIR}/${ORIG_DOCKERFILE}"
 if [[ ! -f "${ORIG_DOCKERFILE}" ]]; then
@@ -156,21 +163,6 @@ echo "FINAL_TAG: ${FINAL_TAG}"
 echo "Original Dockerfile: ${ORIG_DOCKERFILE}"
 echo ""
 
-
-DO_PIP_BUILD=0
-if [[ ${TF_DOCKER_BUILD_IS_DEVEL} == "yes" ]]; then
-  # Devel builds has pip build instructions in the Dockerfile
-  :
-else
-  if [[ ! -z ${TF_DOCKER_BUILD_CENTRAL_PIP} ]] &&
-     [[ ${TF_DOCKER_BUILD_CENTRAL_PIP} != "0" ]]; then
-    :
-  else
-    DO_PIP_BUILD=1
-  fi
-fi
-
-
 # Create tmp directory for Docker build
 TMP_DIR=$(mktemp -d)
 echo ""
@@ -179,67 +171,96 @@ echo "Docker build will occur in temporary directory: ${TMP_DIR}"
 
 # Copy all files to tmp directory for Docker build
 cp -r ${SCRIPT_DIR}/* "${TMP_DIR}/"
 
-
-if [[ "${DO_PIP_BUILD}" == "1" ]]; then
+if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
   DOCKERFILE="${TMP_DIR}/Dockerfile"
 
-  # Perform local build of the required PIP whl file
-  export TF_BUILD_CONTAINER_TYPE=${TF_DOCKER_BUILD_TYPE}
-  export TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION}
-  export TF_BUILD_OPTIONS=${TF_DOCKER_BUILD_OPTIONS}
-  export TF_BUILD_IS_PIP="PIP"
-
-  if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
-    export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
-"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2"
-  fi
-
-  pushd "${SCRIPT_DIR}/../../../"
-  rm -rf pip_test/whl &&
-  tensorflow/tools/ci_build/ci_parameterized_build.sh
-  PIP_BUILD_EXIT_CODE=$?
-  popd
-
-  # Was the pip build successful?
-  if [[ ${PIP_BUILD_EXIT_CODE} != "0" ]]; then
-    die "FAIL: Failed to build pip file locally"
-  fi
-
-  PIP_WHL=$(ls pip_test/whl/*.whl | head -1)
-  if [[ -z "${PIP_WHL}" ]]; then
-    die "ERROR: Cannot locate the locally-built pip whl file"
-  fi
-  echo "Locally-built PIP whl file is at: ${PIP_WHL}"
-
-  # Copy the pip file to tmp directory
-  cp "${PIP_WHL}" "${TMP_DIR}/" || \
-    die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
-
-  # Use string replacement to put the correct file name into the Dockerfile
-  PIP_WHL=$(basename "${PIP_WHL}")
-
-  # Modify the non-devel Dockerfile to point to the correct pip whl file
-  # location
-  sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+  if [[ -z "${TF_DOCKER_BUILD_CENTRAL_PIP}" ]]; then
+    # Perform local build of the required PIP whl file
+    export TF_BUILD_CONTAINER_TYPE=${TF_DOCKER_BUILD_TYPE}
+    export TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION}
+    export TF_BUILD_OPTIONS=${TF_DOCKER_BUILD_OPTIONS}
+    export TF_BUILD_IS_PIP="PIP"
+
+    if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
+      export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
+    "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2"
+    fi
+
+    pushd "${SCRIPT_DIR}/../../../"
+    rm -rf pip_test/whl &&
+    tensorflow/tools/ci_build/ci_parameterized_build.sh
+    PIP_BUILD_EXIT_CODE=$?
+    popd
+
+    # Was the pip build successful?
+    if [[ ${PIP_BUILD_EXIT_CODE} != "0" ]]; then
+      die "FAIL: Failed to build pip file locally"
+    fi
+
+    PIP_WHL=$(ls pip_test/whl/*.whl | head -1)
+    if [[ -z "${PIP_WHL}" ]]; then
+      die "ERROR: Cannot locate the locally-built pip whl file"
+    fi
+    echo "Locally-built PIP whl file is at: ${PIP_WHL}"
+
+    # Copy the pip file to tmp directory
+    cp "${PIP_WHL}" "${TMP_DIR}/" || \
+      die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
+
+    # Use string replacement to put the correct file name into the Dockerfile
+    PIP_WHL=$(basename "${PIP_WHL}")
+
+    # Modify the non-devel Dockerfile to point to the correct pip whl file
+    # location
+    sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
 "/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
 "COPY ${PIP_WHL} /\n"\
 "RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
     > "${DOCKERFILE}"
+  else
+    echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
+    echo
+
+    # Modify the non-devel Dockerfile to point to the correct pip whl URL.
+    sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
+"RUN pip --no-cache-dir install ${TF_DOCKER_BUILD_CENTRAL_PIP}" "${ORIG_DOCKERFILE}" \
+    > "${DOCKERFILE}"
+  fi
 
   echo "Modified Dockerfile at: ${DOCKERFILE}"
+  echo
+
+  # Modify python/pip version if necessary.
+  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
+    sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
+    sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
+    sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
+    sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
+    echo "Modified Dockerfile for python version "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
+    die "FAILED to modify ${DOCKERFILE} for python3"
+  fi
 else
-  if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "yes" ]]; then
-    DOCKERFILE="${TMP_DIR}/Dockerfile"
+  DOCKERFILE="${TMP_DIR}/Dockerfile"
 
-    # Modify the devel Dockerfile to specify the git branch
-    sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
-      "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
-  else
-    DOCKERFILE="${TMP_DIR}/"$(basename "${ORIG_DOCKERFILE}")
+  # Modify the devel Dockerfile to specify the git branch
+  sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
+    "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
+
+  # Modify python/pip version if necessary.
+ if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then + sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \ + sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \ + sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \ + sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \ + sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \ + sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \ + echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \ + die "FAILED to modify ${DOCKERFILE} for python3" fi fi - # Perform docker build # Intermediate image name with tag IMG="${USER}/tensorflow:${FINAL_TAG}" diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index d0c813e84ffac7..31836fe432f3f1 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -92,6 +92,7 @@ filegroup( "@grpc//:LICENSE", "@highwayhash//:LICENSE", "@jpeg//:LICENSE.md", + "@libxsmm_archive//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@nanopb_git//:LICENSE.txt", "@png_archive//:LICENSE", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index a22279f35d7453..551787a1b81204 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -28,6 +28,19 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): build_file = str(Label("//:eigen.BUILD")), ) + native.new_http_archive( + name = "libxsmm_archive", + url = "https://github.com/hfp/libxsmm/archive/1.5.tar.gz", + sha256 = "c52568c5e0e8dc9d8fcf869a716d73598e52f71c3d83af5a4c0b3be81403b423", + strip_prefix = "libxsmm-1.5", + build_file = str(Label("//:libxsmm.BUILD")), + ) + + native.bind( + name = "xsmm_avx", + actual = "@libxsmm_archive//:xsmm_avx", + ) + native.http_archive( name = "com_googlesource_code_re2", url = "http://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz", @@ -88,7 +101,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "gif_archive", - url = "http://cdimage.debian.org/mirror/xbmc.org/build-deps/sources/giflib-5.1.4.tar.gz", + url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz", sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1", strip_prefix = "giflib-5.1.4", build_file = str(Label("//:gif.BUILD")), @@ -148,7 +161,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "swig", sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453", - url = "http://cdimage.debian.org/mirror/xbmc.org/build-deps/sources/swig-3.0.8.tar.gz", + url = "http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz", strip_prefix = "swig-3.0.8", build_file = str(Label("//third_party:swig.BUILD")), ) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 82a96e35152b7c..20920f7fca30fb 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -132,3 +132,9 @@ cc_library( data = ["lib/%{cupti_lib}"], visibility = ["//visibility:public"], ) + +cc_library( + name = "libdevice_root", + data = glob(["nvvm/libdevice/*.bc"]), + visibility = ["//visibility:public"], +) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 955951a9b6054b..6d7754a897e123 100644 --- a/third_party/gpus/cuda_configure.bzl +++ 
@@ -194,13 +194,16 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
       auto_configure_fail(
           "CUDA version detected from nvcc (%s) does not match " +
           "TF_CUDA_VERSION (%s)" % (version, environ_version))
+
+  if cpu_value == "Windows":
+    version = "64_" + version.replace(".", "")
   return version
 
 
 _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
 
 
-def _cudnn_version(repository_ctx, cudnn_install_basedir):
+def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   """Detects the version of cuDNN installed on the system.
 
   Args:
@@ -236,8 +239,11 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir):
     environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
     if environ_version and version != environ_version:
       auto_configure_fail(
-          "cuDNN version detected from %s (%s) does not match " +
-          "TF_CUDNN_VERSION (%s)" % (str(cudnn_h_path), version, environ_version))
+          ("cuDNN version detected from %s (%s) does not match " +
+           "TF_CUDNN_VERSION (%s)") % (str(cudnn_h_path), version, environ_version))
+
+  if cpu_value == "Windows":
+    version = "64_" + version
   return version
 
 
@@ -297,6 +303,11 @@ def _lib_name(lib, cpu_value, version="", static=False):
   elif cpu_value == "Windows":
     return "%s.lib" % lib
   elif cpu_value == "Darwin":
+    if static:
+      return "lib%s.a" % lib
+    else:
+      if version:
+        version = ".%s" % version
     return "lib%s%s.dylib" % (lib, version)
   else:
     auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
@@ -496,7 +507,7 @@ def _get_cuda_config(repository_ctx):
   cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
   cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
   cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
-  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir)
+  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value)
   return struct(
       cuda_toolkit_path = cuda_toolkit_path,
       cudnn_install_basedir = cudnn_install_basedir,
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index e1c20e82a7276a..bef7b24d706ced 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -280,6 +280,7 @@ cc_binary(
         "lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h",
     ],
     linkopts = [
+        "-lm",
        "-ldl",
        "-lpthread",
     ],
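
Note: the cuda_configure.bzl hunks above thread a cpu_value argument through the version and library-name helpers so that Windows gets "64_"-prefixed, dot-free version suffixes and macOS gets proper .a/.dylib names. The snippet below is only an illustrative plain-Python paraphrase of the Darwin/Windows branches visible in the _lib_name hunk; it is not the repository's Starlark, and the other platform branches (which the hunk does not show) are reduced to an error here.

# lib_name_sketch.py -- illustrative paraphrase of the _lib_name() branches
# shown in the cuda_configure.bzl hunk above (not the actual Starlark rule).
def lib_name(lib, cpu_value, version="", static=False):
    """Returns the platform-specific file name for a CUDA-related library."""
    if cpu_value == "Windows":
        return "%s.lib" % lib                   # e.g. cudart.lib
    elif cpu_value == "Darwin":
        if static:
            return "lib%s.a" % lib              # static archives carry no version
        if version:
            version = ".%s" % version           # "8.0" -> ".8.0"
        return "lib%s%s.dylib" % (lib, version)
    # The real rule handles other platforms and calls auto_configure_fail here.
    raise ValueError("Invalid cpu_value: %s" % cpu_value)

assert lib_name("cudart", "Windows") == "cudart.lib"
assert lib_name("cudart", "Darwin", version="8.0") == "libcudart.8.0.dylib"
assert lib_name("cudart_static", "Darwin", static=True) == "libcudart_static.a"
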