Skip to content

Commit cca5d0a

Browse files
David G. Andersen and tensorflower-gardener
David G. Andersen
authored and committed
Ensure that excessively large sparse matmuls are not executed
on the GPU b/c of 32 bit indexing optimization, and provide a friendly warning instead of wrong results. Change: 118988179
1 parent 5a0e8fb commit cca5d0a

File tree

3 files changed

+45
-8
lines changed

3 files changed

+45
-8
lines changed

tensorflow/core/kernels/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,7 @@ tf_kernel_libraries(
12111211
"serialize_sparse_op",
12121212
],
12131213
deps = [
1214+
":bounds_check",
12141215
":fill_functor",
12151216
"//tensorflow/core:framework",
12161217
"//tensorflow/core:lib",

tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc

+27-5
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ limitations under the License.
2121

2222
#include "tensorflow/core/framework/op.h"
2323
#include "tensorflow/core/framework/op_kernel.h"
24+
#include "tensorflow/core/kernels/bounds_check.h"
2425
#include "tensorflow/core/kernels/fill_functor.h"
2526

2627
namespace tensorflow {
@@ -111,9 +112,24 @@ class SparseTensorDenseMatMulOp : public OpKernel {
111112
}
112113

113114
Tensor scratch;
114-
int nnz = a_values->NumElements();
115115

116116
if (std::is_same<Device, GPUDevice>::value) {
117+
// The GPU implementation is optimized to use 32 bit indexing, so
118+
// give a friendly error to the programmer early on if they exceed.
119+
OP_REQUIRES(
120+
ctx,
121+
FastBoundsCheck(inner_left, std::numeric_limits<int>::max()) &&
122+
FastBoundsCheck(inner_right, std::numeric_limits<int>::max()) &&
123+
FastBoundsCheck(outer_left, std::numeric_limits<int>::max()) &&
124+
FastBoundsCheck(outer_right, std::numeric_limits<int>::max()) &&
125+
FastBoundsCheck(b->NumElements(),
126+
std::numeric_limits<int>::max()) &&
127+
FastBoundsCheck(out->NumElements(),
128+
std::numeric_limits<int>::max()) &&
129+
FastBoundsCheck(a_values->NumElements(),
130+
std::numeric_limits<int>::max()),
131+
errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
132+
const int nnz = static_cast<const int>(a_values->NumElements());
117133
// Need nnz length vec scratch space on the GPU.
118134
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
119135
TensorShape({nnz}), &scratch));
@@ -207,6 +223,7 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
207223
typename TTypes<T>::Vec scratch) {
208224
const std::size_t nnz = a_values.size();
209225
const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
226+
const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
210227
const int lhs_index_a = ADJ_A ? 1 : 0;
211228
const int rhs_index_a = ADJ_A ? 0 : 1;
212229

@@ -220,8 +237,10 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
220237
// Disable vectorization if the RHS of output is too small
221238
auto maybe_adjoint_b = MaybeAdjoint<decltype(b), ADJ_B>(b);
222239
for (std::size_t i = 0; i < nnz; ++i) {
223-
const int64 m = a_indices(i, lhs_index_a);
224-
const int64 k = a_indices(i, rhs_index_a);
240+
const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
241+
const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
242+
CHECK_LT(k, lhs_right);
243+
CHECK_LT(m, out.dimension(0));
225244
const T a_value = ADJ_A ? MaybeConj(a_values(i)) : a_values(i);
226245
for (std::size_t n = 0; n < rhs_right; ++n) {
227246
const T b_value = maybe_adjoint_b(k, n);
@@ -230,15 +249,18 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
230249
}
231250
} else {
232251
for (std::size_t i = 0; i < nnz; ++i) {
233-
const int64 m = a_indices(i, lhs_index_a);
234-
const int64 k = a_indices(i, rhs_index_a);
252+
const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
253+
const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
235254
const T a_value = (ADJ_A) ? MaybeConj(a_values(i)) : a_values(i);
255+
CHECK_LT(m, out.dimension(0));
236256
if (ADJ_B) {
257+
CHECK_LT(k, b.dimension(1));
237258
out.template chip<0>(m) +=
238259
b.template chip<1>(k).unaryExpr(
239260
Eigen::internal::scalar_conjugate_op<T>()) *
240261
a_value;
241262
} else {
263+
CHECK_LT(k, b.dimension(0));
242264
out.template chip<0>(m) += b.template chip<0>(k) * a_value;
243265
}
244266
}

tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc

+17-3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class SparseTensorDenseMatMulGPUGenerator {
4040
rhs_index_a_(ADJ_A ? 0 : 1),
4141
a_indices_(a_indices),
4242
a_values_(a_values),
43+
lhs_right_size(ADJ_B ? b.dimension(1) : b.dimension(0)),
4344
maybe_adjoint_b_(
4445
functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit,
4546
ADJ_B>(b)) {}
@@ -49,9 +50,21 @@ class SparseTensorDenseMatMulGPUGenerator {
4950
#ifdef __CUDA_ARCH__
5051
const int j = j_and_ix[0];
5152
const int ix = j_and_ix[1];
52-
const int m = a_indices_(ix, lhs_index_a_);
53-
const int k = a_indices_(ix, rhs_index_a_);
54-
const T b_value = maybe_adjoint_b_(k, j);
53+
int m = a_indices_(ix, lhs_index_a_);
54+
int k = a_indices_(ix, rhs_index_a_);
55+
assert(k < lhs_right_size);
56+
assert(m < out_.dimension(0));
57+
// If asserts are disabled, the caller is violating the sparse
58+
// tensor index contract, and so we return invalid results.
59+
// Force returning NaNs to try to signal that something is amiss.
60+
T b_value;
61+
if (k >= lhs_right_size || m >= out_.dimension(0)) {
62+
m = 0;
63+
k = 0;
64+
b_value = std::numeric_limits<T>::quiet_NaN();
65+
} else {
66+
b_value = maybe_adjoint_b_(k, j);
67+
}
5568
atomicAdd(&out_(m, j), a_values_(ix) * b_value);
5669
#else
5770
assert(false && "This should only be run on the device");
@@ -66,6 +79,7 @@ class SparseTensorDenseMatMulGPUGenerator {
6679
const int rhs_index_a_;
6780
TTypes<const int64, 2>::Tensor32Bit a_indices_;
6881
typename TTypes<const T, 1>::Tensor32Bit a_values_;
82+
const int lhs_right_size;
6983
functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit, ADJ_B>
7084
maybe_adjoint_b_;
7185
};

0 commit comments

Comments
 (0)