Skip to content

[OneDNN] Optimize unsorted segment sum on 1 dim #74

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions tensorflow/core/kernels/segment_reduction_ops_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,12 @@ struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
const int64_t N = segment_ids.dimension(0);
const int64_t num_segments = output.dimension(0);
const int64_t inner_dim = data.dimension(1);
const T* data_ptr = data.data();
T* out_ptr = output.data();
ReductionF reduction;

bool data_is_1D = data.dimensions()[1] == 1;

// `num_real_segment` counts the rows actually reduced from input,
// the rows with negative segment index will be excluded.
// It will be used for cost model.
Expand Down Expand Up @@ -410,15 +414,29 @@ struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
}
}
};

auto reductionWorker1D = [&](int64_t begin, int64_t end) -> void {
for (int64_t i = 0; i < N; i++) {
Index j = internal::SubtleMustCopy(segment_ids(i));
// If `j` is in work scope of this worker, do the reduction.
if (j >= begin && j < end) {
reduction(data_ptr[i], out_ptr[j]);
}
}
};
// Reduction functors include Sum, Max, Min, etc. Simply assume each
// operation costs 5 cycles.
const int64_t kAverTaskSize = num_real_segment / num_segments;
const int64_t compute_cycles = 5 * inner_dim * kAverTaskSize;
const int64_t input_bytes = sizeof(T) * inner_dim * kAverTaskSize;
const int64_t output_bytes = sizeof(T) * inner_dim * kAverTaskSize;
const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
cpu_device.parallelFor(num_segments, cost, reductionWorker);
if(data_is_1D) {
cpu_device.parallelFor(num_segments, cost, reductionWorker1D);
}
else {
cpu_device.parallelFor(num_segments, cost, reductionWorker);
}

}
};

Expand All @@ -435,27 +453,39 @@ struct SumOp {
// Accumulates one input row into the matching output row, element-wise
// (dense / inner_dim > 1 path).
void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
output += data;
}
// Scalar overload used by the 1-D reduction worker (inner_dim == 1):
// folds a single element into the running sum.
void operator()(const T &data, T &output) {
  output = output + data;
}
};

template <typename T>
struct MaxOp {
  // Block form: keep the per-element maximum of the incoming row and the
  // current output row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output = data.cwiseMax(output);
  }
  // Scalar form for the 1-D fast path. Written as the exact expansion of
  // std::max(data, output): `data` wins whenever it does not compare less.
  void operator()(const T &data, T &output) {
    output = (data < output) ? output : data;
  }
};

template <typename T>
struct MinOp {
  // Block form: keep the per-element minimum of the incoming row and the
  // current output row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output = data.cwiseMin(output);
  }
  // Scalar form for the 1-D fast path. Written as the exact expansion of
  // std::min(data, output): `data` wins unless `output` compares less.
  void operator()(const T &data, T &output) {
    output = (output < data) ? output : data;
  }
};

template <typename T>
struct ProdOp {
  // Block form: multiply the output row, element-wise, by the incoming row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output *= data;
  }
  // Scalar form for the 1-D fast path: fold one factor into the running
  // product.
  void operator()(const T &data, T &output) {
    output = output * data;
  }
};
} // namespace functor

Expand Down