caisq
diff --git a/‎tensorflow/compiler/tests/BUILD
Lines changed: 1 addition & 1 deletion b/‎tensorflow/compiler/tests/BUILD
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensorflow/compiler/tests/random_ops_test.py
Lines changed: 2 additions & 2 deletions b/‎tensorflow/compiler/tests/random_ops_test.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎tensorflow/compiler/tf2xla/BUILD
Lines changed: 5 additions & 3 deletions b/‎tensorflow/compiler/tf2xla/BUILD
Lines changed: 5 additions & 3 deletions
diff --git a/‎tensorflow/compiler/tf2xla/kernels/BUILD
Lines changed: 4 additions & 2 deletions b/‎tensorflow/compiler/tf2xla/kernels/BUILD
Lines changed: 4 additions & 2 deletions
diff --git a/‎tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
Lines changed: 1 addition & 1 deletion b/‎tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensorflow/compiler/tf2xla/kernels/elu_op.cc
Lines changed: 1 addition & 1 deletion b/‎tensorflow/compiler/tf2xla/kernels/elu_op.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
Lines changed: 4 additions & 4 deletions b/‎tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
Lines changed: 4 additions & 4 deletions
diff --git a/‎tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
Lines changed: 1 addition & 1 deletion b/‎tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensorflow/compiler/tf2xla/kernels/random_ops.cc
Lines changed: 112 additions & 47 deletions b/‎tensorflow/compiler/tf2xla/kernels/random_ops.cc
Lines changed: 112 additions & 47 deletions
diff --git a/‎tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
Lines changed: 1 addition & 1 deletion b/‎tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
Lines changed: 1 addition & 1 deletion
@@ -924,7 +924,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "sort_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["sort_ops_test.py"],
     # Times out in fastbuild mode.
     tags = ["optonly"],
 
@@ -140,10 +140,10 @@ def probit(x, sess=sess):
   def testShuffle1d(self):
     with self.test_session() as sess:
       with self.test_scope():
-        x = math_ops.range(20)
+        x = math_ops.range(1 << 16)
         shuffle = random_ops.random_shuffle(x)
       result = sess.run(shuffle)
-      expected = range(20)
+      expected = range(1 << 16)
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
       self.assertAllEqual(set(result), set(expected))
 
@@ -162,7 +162,7 @@ cc_library(
         ":sharding_util",
         ":tf2xla_util",
         "//tensorflow/compiler/tf2xla/lib:util",
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -202,7 +202,7 @@ cc_library(
     ],
     visibility = [":friends"],
     deps = [
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:core_cpu_internal",
@@ -285,6 +285,7 @@ tf_cc_test(
     deps = [
         ":tf2xla",
         ":tf2xla_proto",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
@@ -327,7 +328,7 @@ tf_cc_test(
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/client:client_library",
@@ -364,6 +365,7 @@ tf_cc_test(
     ],
     deps = [
         ":common",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
 
@@ -114,6 +114,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -159,7 +160,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -175,7 +176,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -210,6 +211,7 @@ tf_kernel_library(
         ":index_ops_kernel_argmax_float_2d",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
 
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/bcast.h"
 
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/no_op.h"
 
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -78,14 +78,14 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     std::vector<xla::XlaOp> args;
     args.push_back(ctx->Input(0));
     args.push_back(xla::ConstantLiteral(
-        &b, *xla::Literal::CreateR1<int64>(input_shape.dim_sizes())));
+        &b, *xla::LiteralUtil::CreateR1<int64>(input_shape.dim_sizes())));
     if (input_shape.dims() > 1) {
       // Don't bother passing the output shape and dim for the 1d case, since
       // the shape is always a scalar and the dim is always 0.
       args.push_back(xla::ConstantLiteral(
-          &b, *xla::Literal::CreateR1<int64>(output_shape.dim_sizes())));
+          &b, *xla::LiteralUtil::CreateR1<int64>(output_shape.dim_sizes())));
       args.push_back(
-          xla::ConstantLiteral(&b, *xla::Literal::CreateR0<int32>(dim)));
+          xla::ConstantLiteral(&b, *xla::LiteralUtil::CreateR0<int32>(dim)));
     }
 
     xla::Shape xla_shape =
 
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 
@@ -74,56 +74,121 @@ class RandomShuffleOp : public XlaOpKernel {
     for (tensorflow::TensorShapeDim dimension : input_shape) {
       num_elements *= dimension.size;
     }
+
     if (num_elements <= 1 || n <= 1) {
       // No shuffling is required, so copy input directly to output
       ctx->SetOutput(0, input);
-    } else {
-      // Generate the random swaps for the indices.
-      auto swaps_shape = xla::ShapeUtil::MakeShape(xla::S32, {n});
-      auto swaps =
-          xla::RngUniform(xla::ConstantR0<int32>(builder, 0),
-                          xla::ConstantR0<int32>(builder, n), swaps_shape);
-
-      // Generate range(n) as the initial value for the indices to be swapped.
-      xla::XlaOp indices = xla::Iota(builder, xla::S32, n);
-
-      // Swap the indices at i and swaps[i].
-      auto swap_body_fn = [&](xla::XlaOp i,
-                              gtl::ArraySlice<xla::XlaOp> loop_vars,
-                              xla::XlaBuilder* builder)
-          -> xla::StatusOr<std::vector<xla::XlaOp>> {
-        auto swaps = loop_vars[0];
-        auto indices = loop_vars[1];
-        i = xla::Reshape(i, {1});
-        // temp = indices[i]
-        auto temp = xla::DynamicSlice(indices, i, {1});
-        // swap_index = swaps[i]
-        auto swap_index = xla::DynamicSlice(swaps, i, {1});
-        // swap_value = indices[swaps[i]]
-        auto swap_value = xla::DynamicSlice(indices, swap_index, {1});
-        // indices[i] = indices[swaps[i]]
-        indices = xla::DynamicUpdateSlice(indices, swap_value, i);
-        // indices[swaps[i]] = temp
-        indices = xla::DynamicUpdateSlice(indices, temp, swap_index);
-        return std::vector<xla::XlaOp>{swaps, indices};
-      };
-      // for i in range(n):
-      auto swap_loop_result =
-          XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
-                          "indices_swap_loop", builder)
-              .ValueOrDie();
-      auto swapped_indices = swap_loop_result[1];
-
-      // Gather the data using the swapped indices as the shuffled order.
-      auto indices_tensor_shape = TensorShape({n});
-      DataType type = ctx->expected_output_dtype(0);
-      xla::XlaOp gather;
-      OP_REQUIRES_OK(ctx, XlaGather(input, input_shape, swapped_indices,
-                                    indices_tensor_shape,
-                                    /*axis=*/0, /*indices_are_nd=*/false, type,
-                                    DT_INT32, builder, &gather));
-      ctx->SetOutput(0, gather);
+      return;
+    }
+
+    if (input_shape.dims() == 1) {
+      // For R1s, shuffle values by sorting instead of the obvious Fisher-Yates
+      // algorithm. Fisher-Yates is simple to implement and correct, but not
+      // easily parallelizable. For a sufficiently parallel architecture, it is
+      // faster to sort many times than to Fisher-Yates shuffle once.
+
+      // Shuffle values by assigning each value a random key and sorting the
+      // keys. Keys can collide causing detectable patterns in the shuffled
+      // output. Collisions translate into more ascending sub-sequences in the
+      // shuffled output than would be expected by chance. To avoid collisions,
+      // the number of possible key values must be sufficiently large.
+
+      // How are more than 2^32 keys created? In each loop iteration, the
+      // algorithm sorts by random keys. Conceptually, the earlier iterations
+      // are sorting on the lower-order bits of larger keys that are never
+      // actually assembled.
+
+      // The expected number of collisions is n - d + d(1 - 1/d)^n, where d is
+      // the number of possible keys and n is the number of values. If d = n^2,
+      // then the limit as n goes to infinity is 1/2. If d = n^3, then the limit
+      // as n goes to infinity is zero.
+
+      // This implementation ensures that the key-space is greater than or equal
+      // to the cube of the number of values. The risk of collisions can be
+      // further reduced by increasing Exponent at the expense of
+      // performance.
+
+      // For Exponent = 2, the expected number of collisions per shuffle is
+      // maximized at n = floor((2^32-1)^(1/2)) = 65535 where the expectation is
+      // about 1/2.
+
+      // For Exponent = 3, the expected number of collisions per shuffle is
+      // maximized at n = floor((2^32-1)^(1/3)) = 1625 where the expectation is
+      // about 1/3255.
+
+      // For Exponent = 4, the expected number of collisions per shuffle is
+      // maximized at n = floor((2^32-1)^(1/4)) = 255 where the expectation is
+      // about 1/132622.
+      constexpr int Exponent = 3;
+      const int rounds = static_cast<int>(
+          std::ceil(Exponent * std::log(num_elements) / std::log(kuint32max)));
+
+      const xla::Shape key_shape =
+          xla::ShapeUtil::MakeShape(xla::U32, {num_elements});
+      xla::XlaOp zero = xla::ConstantR0(builder, 0U);
+
+      // Unfortunately, xla::RngUniform gives values in the half-open interval
+      // rather than the closed interval, so instead of 2^32 possible keys there
+      // are only 2^32 - 1 (kuint32max).
+      xla::XlaOp max_value = xla::ConstantR0(builder, kuint32max);
+
+      xla::XlaOp curr = input;
+      for (int i = 0; i < rounds; ++i) {
+        xla::XlaOp keys = xla::RngUniform(zero, max_value, key_shape);
+        xla::XlaOp sorted = xla::Sort(keys, curr);
+        curr = xla::GetTupleElement(sorted, 1);
+      }
+
+      ctx->SetOutput(0, curr);
+      return;
     }
+
+    // The Fisher-Yates algorithm.
+
+    // Generate the random swaps for the indices.
+    auto swaps_shape = xla::ShapeUtil::MakeShape(xla::S32, {n});
+    auto swaps =
+        xla::RngUniform(xla::ConstantR0<int32>(builder, 0),
+                        xla::ConstantR0<int32>(builder, n), swaps_shape);
+
+    // Generate range(n) as the initial value for the indices to be swapped.
+    xla::XlaOp indices = xla::Iota(builder, xla::S32, n);
+
+    // Swap the indices at i and swaps[i].
+    auto swap_body_fn = [&](xla::XlaOp i, gtl::ArraySlice<xla::XlaOp> loop_vars,
+                            xla::XlaBuilder* builder)
+        -> xla::StatusOr<std::vector<xla::XlaOp>> {
+      auto swaps = loop_vars[0];
+      auto indices = loop_vars[1];
+      i = xla::Reshape(i, {1});
+      // temp = indices[i]
+      auto temp = xla::DynamicSlice(indices, i, {1});
+      // swap_index = swaps[i]
+      auto swap_index = xla::DynamicSlice(swaps, i, {1});
+      // swap_value = indices[swaps[i]]
+      auto swap_value = xla::DynamicSlice(indices, swap_index, {1});
+      // indices[i] = indices[swaps[i]]
+      indices = xla::DynamicUpdateSlice(indices, swap_value, i);
+      // indices[swaps[i]] = temp
+      indices = xla::DynamicUpdateSlice(indices, temp, swap_index);
+      return std::vector<xla::XlaOp>{swaps, indices};
+    };
+    // for i in range(n):
+    auto swap_loop_result =
+        XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
+                        "indices_swap_loop", builder)
+            .ValueOrDie();
+    auto swapped_indices = swap_loop_result[1];
+
+    // Gather the data using the swapped indices as the shuffled order.
+    auto indices_tensor_shape = TensorShape({n});
+    DataType type = ctx->expected_output_dtype(0);
+    xla::XlaOp gather;
+    OP_REQUIRES_OK(ctx, XlaGather(input, input_shape, swapped_indices,
+                                  indices_tensor_shape,
+                                  /*axis=*/0, /*indices_are_nd=*/false, type,
+                                  DT_INT32, builder, &gather));
+    ctx->SetOutput(0, gather);
   }
 
  private:
@@ -220,5 +285,5 @@ REGISTER_XLA_OP(Name("TruncatedNormal")
                     .TypeConstraint("dtype", DT_FLOAT),
                 TruncatedNormalOp);
 
-}  // anonymous namespace
+}  // namespace
 }  // namespace tensorflow
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {