Skip to content

Vectorize RandomUniform format function for float/bfloat16 #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 94 additions & 6 deletions tensorflow/core/kernels/random_op_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,83 @@ namespace functor {
using random::PhiloxRandom;
using random::SingleSampleAdapter;

// Thin wrapper around a Distribution that exposes two operations: drawing a
// group of samples, and a post-processing hook (VecCopy) over a filled buffer.
// In this primary template sampling is forwarded unchanged to the wrapped
// distribution and VecCopy() does nothing.
template <class Generator, typename RealType, class Distribution>
class DistributionVec {
 public:
  // Keeps the raw pointer to `dist`; this class never deletes it.
  explicit DistributionVec(Distribution* dist) : dist_(dist) {}

  // Invokes the wrapped distribution on `gen` and returns its result.
  typename Distribution::ResultType operator()(Generator* gen) {
    Distribution& wrapped = *dist_;
    return wrapped(gen);
  }

  // Post-processing hook for `length` elements starting at `data`.
  // Deliberately a no-op here.
  void VecCopy(RealType* data, int64 length) {}

 private:
  Distribution* dist_;
};

template <>
class DistributionVec<
random::PhiloxRandom, bfloat16,
random::UniformDistribution<random::PhiloxRandom, bfloat16>> {
public:
typedef random::UniformDistribution<random::PhiloxRandom, bfloat16>
Distribution;

explicit DistributionVec(Distribution* dist) { this->dist = dist; }

typename Distribution::ResultType operator()(random::PhiloxRandom* gen) {
typename random::PhiloxRandom::ResultType sample = (*gen)();
typename Distribution::ResultType result;

for (int i = 0; i < Distribution::kResultElementCount; ++i) {
result[i] = tensorflow::random::InternalUint16ToBfloat16(sample[i]);
}

return result;
}

void VecCopy(bfloat16* data, int64 length) {
// The mantissa has an implicit leading 1, so the above code creates a value
// in [1, 2). The minus will not cause a rounding that makes the result 1.
// Instead it will just be close to 1.
auto result_t = typename TTypes<bfloat16>::Tensor(data, length);
result_t = result_t - bfloat16(1.0);
}

private:
Distribution* dist;
};

template <>
class DistributionVec<
random::PhiloxRandom, float,
random::UniformDistribution<random::PhiloxRandom, float>> {
public:
typedef random::UniformDistribution<random::PhiloxRandom, float> Distribution;

explicit DistributionVec(Distribution* dist) { this->dist = dist; }

typename Distribution::ResultType operator()(random::PhiloxRandom* gen) {
typename random::PhiloxRandom::ResultType sample = (*gen)();
typename Distribution::ResultType result;

for (int i = 0; i < Distribution::kResultElementCount; ++i) {
result[i] = tensorflow::random::InternalUint32ToFloat(sample[i]);
}

return result;
}

void VecCopy(float* data, int64 length) {
auto result_t = typename TTypes<float>::Tensor(data, length);
result_t = result_t - 1.0f;
}

private:
Distribution* dist;
};

// The default implementation of the functor, which should never be invoked.
// But we still need to provide an implementation for now for the linker to
// work, since we do not support all the distributions yet.
Expand Down Expand Up @@ -91,18 +168,23 @@ struct FillPhiloxRandomTask<Distribution, false> {

// First fill all the full-size groups
int64 limit_group_full = std::min(limit_group, size / kGroupSize);
DistributionVec<random::PhiloxRandom, T, Distribution> dist_vec(&dist);
for (int64 index = start_group; index < limit_group_full; ++index) {
auto samples = dist(&gen);
auto samples = dist_vec(&gen);
std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
offset += kGroupSize;
}

// If there are any remaining elements that need to be filled, process them
int64 remaining_size = 0;
if (limit_group_full < limit_group) {
int64 remaining_size = size - limit_group_full * kGroupSize;
auto samples = dist(&gen);
remaining_size = size - limit_group_full * kGroupSize;
auto samples = dist_vec(&gen);
std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
}
dist_vec.VecCopy(
data + start_group * kGroupSize,
(limit_group_full - start_group) * kGroupSize + remaining_size);
}
};

Expand All @@ -126,6 +208,8 @@ struct FillPhiloxRandomTask<Distribution, true> {
// First fill all the full-size groups
int64 limit_group_full = std::min(limit_group, size / kGroupSize);
int64 group_index;
DistributionVec<SingleSampleAdapter<PhiloxRandom>, T, Distribution>
dist_vec(&dist);
for (group_index = start_group; group_index < limit_group_full;
++group_index) {
// Reset the generator to the beginning of the output group region
Expand All @@ -135,21 +219,25 @@ struct FillPhiloxRandomTask<Distribution, true> {
gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
SingleSampleAdapter<PhiloxRandom> single_samples(&gen);

auto samples = dist(&single_samples);
auto samples = dist_vec(&single_samples);
std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
offset += kGroupSize;
}

// If there are any remaining elements that need to be filled, process them
int64 remaining_size = 0;
if (limit_group_full < limit_group) {
PhiloxRandom gen = base_gen;
gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
SingleSampleAdapter<PhiloxRandom> single_samples(&gen);

int64 remaining_size = size - limit_group_full * kGroupSize;
auto samples = dist(&single_samples);
remaining_size = size - limit_group_full * kGroupSize;
auto samples = dist_vec(&single_samples);
std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
}
dist_vec.VecCopy(
data + start_group * kGroupSize,
(limit_group_full - start_group) * kGroupSize + remaining_size);
}
};

Expand Down
40 changes: 29 additions & 11 deletions tensorflow/core/lib/random/random_distributions.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,23 @@ limitations under the License.
#include <algorithm>
#include <type_traits>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/lib/bfloat16/bfloat16.h"
#include "tensorflow/core/lib/random/philox_random.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

namespace tensorflow {
namespace random {

// Helper function to convert a 16-bit integer to a half between [0..1).
PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
// Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
PHILOX_DEVICE_INLINE bfloat16 Uint16ToBfloat16(uint16 x);
// Helper function to convert a 16-bit integer to a bfloat16 between [1..2).
PHILOX_DEVICE_INLINE bfloat16 InternalUint16ToBfloat16(uint16 x);
// Helper function to convert a 32-bit integer to a float between [0..1).
PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
// Helper function to convert a 32-bit integer to a float between [1..2).
PHILOX_DEVICE_INLINE float InternalUint32ToFloat(uint32 x);
// Helper function to convert two 32-bit integers to a double between [0..1).
PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);

Expand Down Expand Up @@ -108,9 +112,11 @@ class UniformDistribution<Generator, bfloat16> {
ResultType operator()(Generator* gen) {
typename Generator::ResultType sample = (*gen)();
ResultType result;

for (int i = 0; i < kResultElementCount; ++i) {
result[i] = Uint16ToGfloat16(sample[i]);
result[i] = Uint16ToBfloat16(sample[i]);
}

return result;
}
};
Expand Down Expand Up @@ -764,9 +770,9 @@ PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
return result - Eigen::half(1.0);
}

// Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
// This can create a uniform distribution of values between [0..1).
PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
// Helper function to convert a 16-bit integer to a bfloat16 between [1..2).
// This can create a uniform distribution of values between [1..2).
PHILOX_DEVICE_INLINE bfloat16 InternalUint16ToBfloat16(uint16 x) {
// bfloat are formatted as follows (MSB first):
// sign(1) exponent(8) mantissa(7)
// Conceptually construct the following:
Expand All @@ -780,13 +786,20 @@ PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
bfloat16 result;
memcpy(&result, &val, sizeof(val));
// The mantissa has an implicit leading 1, so the above code creates a value
// in [1, 2). The minus will not cause a rounding that makes the result 1.
// in [1, 2).
return result;
}

// Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
// This can create a uniform distribution of values between [0..1).
PHILOX_DEVICE_INLINE bfloat16 Uint16ToBfloat16(uint16 x) {
// The minus will not cause a rounding that makes the result 1.
// Instead it will just be close to 1.
return result - bfloat16(1.0);
return InternalUint16ToBfloat16(x) - bfloat16(1.0);
}

// Helper function to convert an 32-bit integer to a float between [0..1).
PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
// Helper function to convert a 32-bit integer to a float between [1..2).
PHILOX_DEVICE_INLINE float InternalUint32ToFloat(uint32 x) {
// IEEE754 floats are formatted as follows (MSB first):
// sign(1) exponent(8) mantissa(23)
// Conceptually construct the following:
Expand All @@ -800,7 +813,12 @@ PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
// Assumes that endian-ness is same for float and uint32.
float result;
memcpy(&result, &val, sizeof(val));
return result - 1.0f;
return result;
}

// Helper function to convert a 32-bit integer to a float between [0..1).
// Takes the value returned by InternalUint32ToFloat() and subtracts 1.0f.
PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
  const float shifted = InternalUint32ToFloat(x);
  return shifted - 1.0f;
}

// Helper function to convert two 32-bit integers to a double between [0..1).
Expand Down