Intel-tensorflow · chuanqi129 · Apr 29, 2020 · Apr 29, 2020 · Apr 29, 2020
diff --git a/tensorflow/core/kernels/random_op_cpu.h b/tensorflow/core/kernels/random_op_cpu.h
@@ -56,6 +56,83 @@ namespace functor {
 using random::PhiloxRandom;
 using random::SingleSampleAdapter;
 
+template <class Generator, typename RealType, class Distribution>
+class DistributionVec {
+ public:
+  explicit DistributionVec(Distribution* dist) { this->dist = dist; }
+
+  typename Distribution::ResultType operator()(Generator* gen) {
+    return (*dist)(gen);
+  }
+
+  void VecCopy(RealType* data, int64 length) {}
+
+ private:
+  Distribution* dist;
+};
+
+template <>
+class DistributionVec<
+    random::PhiloxRandom, bfloat16,
+    random::UniformDistribution<random::PhiloxRandom, bfloat16>> {
+ public:
+  typedef random::UniformDistribution<random::PhiloxRandom, bfloat16>
+      Distribution;
+
+  explicit DistributionVec(Distribution* dist) { this->dist = dist; }
+
+  typename Distribution::ResultType operator()(random::PhiloxRandom* gen) {
+    typename random::PhiloxRandom::ResultType sample = (*gen)();
+    typename Distribution::ResultType result;
+
+    for (int i = 0; i < Distribution::kResultElementCount; ++i) {
+      result[i] = tensorflow::random::InternalUint16ToBfloat16(sample[i]);
+    }
+
+    return result;
+  }
+
+  void VecCopy(bfloat16* data, int64 length) {
+    // The mantissa has an implicit leading 1, so the above code creates a value
+    // in [1, 2). The minus will not cause a rounding that makes the result 1.
+    // Instead it will just be close to 1.
+    auto result_t = typename TTypes<bfloat16>::Tensor(data, length);
+    result_t = result_t - bfloat16(1.0);
+  }
+
+ private:
+  Distribution* dist;
+};
+
+template <>
+class DistributionVec<
+    random::PhiloxRandom, float,
+    random::UniformDistribution<random::PhiloxRandom, float>> {
+ public:
+  typedef random::UniformDistribution<random::PhiloxRandom, float> Distribution;
+
+  explicit DistributionVec(Distribution* dist) { this->dist = dist; }
+
+  typename Distribution::ResultType operator()(random::PhiloxRandom* gen) {
+    typename random::PhiloxRandom::ResultType sample = (*gen)();
+    typename Distribution::ResultType result;
+
+    for (int i = 0; i < Distribution::kResultElementCount; ++i) {
+      result[i] = tensorflow::random::InternalUint32ToFloat(sample[i]);
+    }
+
+    return result;
+  }
+
+  void VecCopy(float* data, int64 length) {
+    auto result_t = typename TTypes<float>::Tensor(data, length);
+    result_t = result_t - 1.0f;
+  }
+
+ private:
+  Distribution* dist;
+};
+
 // The default implementation of the functor, which should never be invoked
 // But we still need to provide implementation for now for the linker to work,
 // since we do not support all the distributions yet.
@@ -91,18 +168,23 @@ struct FillPhiloxRandomTask<Distribution, false> {
 
     // First fill all the full-size groups
     int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+    DistributionVec<random::PhiloxRandom, T, Distribution> dist_vec(&dist);
     for (int64 index = start_group; index < limit_group_full; ++index) {
-      auto samples = dist(&gen);
+      auto samples = dist_vec(&gen);
       std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
       offset += kGroupSize;
     }
 
     // If there are any remaining elements that need to be filled, process them
+    int64 remaining_size = 0;
     if (limit_group_full < limit_group) {
-      int64 remaining_size = size - limit_group_full * kGroupSize;
-      auto samples = dist(&gen);
+      remaining_size = size - limit_group_full * kGroupSize;
+      auto samples = dist_vec(&gen);
       std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
     }
+    dist_vec.VecCopy(
+        data + start_group * kGroupSize,
+        (limit_group_full - start_group) * kGroupSize + remaining_size);
   }
 };
 
@@ -126,6 +208,8 @@ struct FillPhiloxRandomTask<Distribution, true> {
     // First fill all the full-size groups
     int64 limit_group_full = std::min(limit_group, size / kGroupSize);
     int64 group_index;
+    DistributionVec<SingleSampleAdapter<PhiloxRandom>, T, Distribution>
+        dist_vec(&dist);
     for (group_index = start_group; group_index < limit_group_full;
          ++group_index) {
       // Reset the generator to the beginning of the output group region
@@ -135,21 +219,25 @@ struct FillPhiloxRandomTask<Distribution, true> {
       gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
       SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
 
-      auto samples = dist(&single_samples);
+      auto samples = dist_vec(&single_samples);
       std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
       offset += kGroupSize;
     }
 
     // If there are any remaining elements that need to be filled, process them
+    int64 remaining_size = 0;
     if (limit_group_full < limit_group) {
       PhiloxRandom gen = base_gen;
       gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
       SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
 
-      int64 remaining_size = size - limit_group_full * kGroupSize;
-      auto samples = dist(&single_samples);
+      remaining_size = size - limit_group_full * kGroupSize;
+      auto samples = dist_vec(&single_samples);
       std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
     }
+    dist_vec.VecCopy(
+        data + start_group * kGroupSize,
+        (limit_group_full - start_group) * kGroupSize + remaining_size);
   }
 };
 

diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
@@ -23,19 +23,23 @@ limitations under the License.
 #include <algorithm>
 #include <type_traits>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/random/philox_random.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace random {
 
 // Helper function to convert a 16-bit integer to a half between [0..1).
 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
-PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToBfloat16(uint16 x);
+// Helper function to convert a 16-bit integer to a bfloat16 between [1..2).
+PHILOX_DEVICE_INLINE bfloat16 InternalUint16ToBfloat16(uint16 x);
 // Helper function to convert a 32-bit integer to a float between [0..1).
 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
+// Helper function to convert a 32-bit integer to a float between [1..2).
+PHILOX_DEVICE_INLINE float InternalUint32ToFloat(uint32 x);
 // Helper function to convert two 32-bit integers to a double between [0..1).
 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
 
@@ -108,9 +112,11 @@ class UniformDistribution<Generator, bfloat16> {
   ResultType operator()(Generator* gen) {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
+
     for (int i = 0; i < kResultElementCount; ++i) {
-      result[i] = Uint16ToGfloat16(sample[i]);
+      result[i] = Uint16ToBfloat16(sample[i]);
     }
+
     return result;
   }
 };
@@ -764,9 +770,9 @@ PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
   return result - Eigen::half(1.0);
 }
 
-// Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
-// This can create a uniform distribution of values between [0..1).
-PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
+// Helper function to convert an 16-bit integer to a bfloat16 between [1..2).
+// This can create a uniform distribution of values between [1..2).
+PHILOX_DEVICE_INLINE bfloat16 InternalUint16ToBfloat16(uint16 x) {
   // bfloat are formatted as follows (MSB first):
   //    sign(1) exponent(8) mantissa(7)
   // Conceptually construct the following:
@@ -780,13 +786,20 @@ PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
   bfloat16 result;
   memcpy(&result, &val, sizeof(val));
   // The mantissa has an implicit leading 1, so the above code creates a value
-  // in [1, 2). The minus will not cause a rounding that makes the result 1.
+  // in [1, 2).
+  return result;
+}
+
+// Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
+// This can create a uniform distribution of values between [0..1).
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToBfloat16(uint16 x) {
+  // The minus will not cause a rounding that makes the result 1.
   // Instead it will just be close to 1.
-  return result - bfloat16(1.0);
+  return InternalUint16ToBfloat16(x) - bfloat16(1.0);
 }
 
-// Helper function to convert an 32-bit integer to a float between [0..1).
-PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
+// Helper function to convert an 32-bit integer to a float between [1..2).
+PHILOX_DEVICE_INLINE float InternalUint32ToFloat(uint32 x) {
   // IEEE754 floats are formatted as follows (MSB first):
   //    sign(1) exponent(8) mantissa(23)
   // Conceptually construct the following:
@@ -800,7 +813,12 @@ PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
   // Assumes that endian-ness is same for float and uint32.
   float result;
   memcpy(&result, &val, sizeof(val));
-  return result - 1.0f;
+  return result;
+}
+
+// Helper function to convert an 32-bit integer to a float between [0..1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
+  return InternalUint32ToFloat(x) - 1.0f;
 }
 
 // Helper function to convert two 32-bit integers to a double between [0..1).