@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
 namespace tensorflow {
@@ -111,9 +112,24 @@ class SparseTensorDenseMatMulOp : public OpKernel {
     }
 
     Tensor scratch;
-    int nnz = a_values->NumElements();
 
     if (std::is_same<Device, GPUDevice>::value) {
+      // The GPU implementation is optimized to use 32 bit indexing, so
+      // give a friendly error to the programmer early on if they exceed.
+      OP_REQUIRES(
+          ctx,
+          FastBoundsCheck(inner_left, std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(inner_right, std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(outer_left, std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(outer_right, std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(b->NumElements(),
+                              std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(out->NumElements(),
+                              std::numeric_limits<int>::max()) &&
+              FastBoundsCheck(a_values->NumElements(),
+                              std::numeric_limits<int>::max()),
+          errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
+      const int nnz = static_cast<const int>(a_values->NumElements());
       // Need nnz length vec scratch space on the GPU.
       OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                              TensorShape({nnz}), &scratch));
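
Note on the guard above: `FastBoundsCheck` (declared in the newly included `bounds_check.h`) accepts values in `[0, limit)`, so one call per dimension and per element count is enough before the 64-bit sizes are narrowed to `int` for the 32-bit-indexed GPU kernel. Below is a minimal standalone sketch of that check-then-narrow pattern, not the kernel's actual code; the `FastBoundsCheck` stand-in assumes the usual single-unsigned-comparison form, and the sizes are illustrative.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>

// Stand-in for tensorflow::FastBoundsCheck: one unsigned comparison rejects
// both index >= limit and index < 0, since a negative value wraps around to
// a very large unsigned number.
template <typename Ta, typename Tb>
bool FastBoundsCheck(Ta index, Tb limit) {
  static_assert(std::is_integral<Ta>::value && std::is_integral<Tb>::value,
                "FastBoundsCheck only works on integer types");
  using UIndex = typename std::make_unsigned<
      typename std::common_type<Ta, Tb>::type>::type;
  return static_cast<UIndex>(index) < static_cast<UIndex>(limit);
}

int main() {
  // Illustrative sizes: the dense dimension fits in 32 bits, the number of
  // nonzeros deliberately does not.
  const std::int64_t inner_left = 4096;
  const std::int64_t nnz = std::int64_t{1} << 32;

  const std::int64_t limit = std::numeric_limits<int>::max();
  if (FastBoundsCheck(inner_left, limit) && FastBoundsCheck(nnz, limit)) {
    // Narrowing to int is only safe once every quantity passed the check.
    const int nnz32 = static_cast<int>(nnz);
    std::cout << "32-bit indexing is safe, nnz = " << nnz32 << "\n";
  } else {
    std::cout << "Cannot use GPU for > 2^31 entry inputs\n";
  }
  return 0;
}
```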
@@ -207,6 +223,7 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
                       typename TTypes<T>::Vec scratch) {
     const std::size_t nnz = a_values.size();
     const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
+    const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
     const int lhs_index_a = ADJ_A ? 1 : 0;
     const int rhs_index_a = ADJ_A ? 0 : 1;
 
@@ -220,8 +237,10 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
       // Disable vectorization if the RHS of output is too small
       auto maybe_adjoint_b = MaybeAdjoint<decltype(b), ADJ_B>(b);
       for (std::size_t i = 0; i < nnz; ++i) {
-        const int64 m = a_indices(i, lhs_index_a);
-        const int64 k = a_indices(i, rhs_index_a);
+        const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
+        const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
+        CHECK_LT(k, lhs_right);
+        CHECK_LT(m, out.dimension(0));
         const T a_value = ADJ_A ? MaybeConj(a_values(i)) : a_values(i);
         for (std::size_t n = 0; n < rhs_right; ++n) {
           const T b_value = maybe_adjoint_b(k, n);
@@ -230,15 +249,18 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
       }
     } else {
       for (std::size_t i = 0; i < nnz; ++i) {
-        const int64 m = a_indices(i, lhs_index_a);
-        const int64 k = a_indices(i, rhs_index_a);
+        const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
+        const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
         const T a_value = (ADJ_A) ? MaybeConj(a_values(i)) : a_values(i);
+        CHECK_LT(m, out.dimension(0));
         if (ADJ_B) {
+          CHECK_LT(k, b.dimension(1));
           out.template chip<0>(m) +=
               b.template chip<1>(k).unaryExpr(
                   Eigen::internal::scalar_conjugate_op<T>()) *
               a_value;
         } else {
+          CHECK_LT(k, b.dimension(0));
           out.template chip<0>(m) += b.template chip<0>(k) * a_value;
         }
       }
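
The CPU-side hunks combine two protections: `internal::SubtleMustCopy` reads each user-supplied index into a local copy before it is validated, so the bounds check and the later use cannot observe different values if the underlying buffer changes, and `CHECK_LT` aborts on an out-of-range index instead of reading or writing out of bounds. Below is a minimal standalone sketch of that copy-then-check-then-use pattern; it assumes SubtleMustCopy's volatile read is what prevents the compiler from re-reading the source, and the `CHECK_LT` stand-in and container names are illustrative, not TensorFlow's.

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

// Stand-in for tensorflow::internal::SubtleMustCopy: reading through a
// volatile pointer forces the value into a local copy, so the bounds check
// and the indexing that follows both operate on the same number even if the
// source buffer is modified concurrently.
template <typename T>
inline const T SubtleMustCopy(const T& x) {
  auto* to_x = reinterpret_cast<const volatile T*>(&x);
  return *to_x;
}

// Minimal CHECK_LT stand-in (the real macro comes from TensorFlow's logging
// headers): abort loudly instead of indexing out of range.
#define CHECK_LT(val, limit)                                 \
  do {                                                       \
    if (!((val) < (limit))) {                                \
      std::cerr << "Check failed: " #val " < " #limit "\n";  \
      std::abort();                                          \
    }                                                        \
  } while (0)

int main() {
  // a_row_indices plays the role of user-supplied sparse row indices;
  // row_sums is the dense output they must stay inside.
  std::vector<std::int64_t> a_row_indices = {0, 2, 1, 3};
  std::vector<double> row_sums(4, 0.0);
  const std::int64_t rows = static_cast<std::int64_t>(row_sums.size());

  for (std::size_t i = 0; i < a_row_indices.size(); ++i) {
    const std::int64_t m = SubtleMustCopy(a_row_indices[i]);  // copy once
    CHECK_LT(m, rows);   // validate the copy (an index of 4 or more aborts)
    row_sums[m] += 1.0;  // only then use it to index
  }
  std::cout << "row_sums[2] = " << row_sums[2] << "\n";
  return 0;
}
```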