#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <c10/util/Optional.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
namespace at {
namespace native {
// Take a Device that may not have a device_index set (i.e., the index is -1,
// meaning the current device) and return the corresponding Device with the
// index resolved against the actual current device at the time of this call.
// No-op if the device_index is already set.
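//
// Illustrative sketch (assuming CUDA device 0 is the current device):
//   ensure_has_index(Device(kCUDA))     // -> Device(kCUDA, 0)
//   ensure_has_index(Device(kCUDA, 1))  // -> Device(kCUDA, 1), already indexed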
static inline Device ensure_has_index(Device device) {
if (device.is_cpu() || device.has_index()) {
return device;
}
const c10::impl::DeviceGuardImplInterface* impl = c10::impl::getDeviceGuardImpl(device.type());
return impl->getDevice();
}
static inline optional<Device> ensure_has_index(optional<Device> device) {
if (!device.has_value()) {
return nullopt;
}
return ensure_has_index(device.value());
}
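// Out-of-place conversion: allocate a new tensor with the requested dtype /
// layout / device / pin_memory and copy `self` into it. With
// MemoryFormat::Preserve, the source strides are kept when `self` is
// non-overlapping and dense and the target device supports as_strided;
// otherwise the copy falls back to self.suggest_memory_format().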
Tensor _to_copy(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
c10::optional<c10::MemoryFormat> optional_memory_format) {
TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(),
"to(options) doesn't support converting to a different layout, "
"but got self.layout being ", self.layout(),
" and options.layout set as ", layout.value());
auto options = TensorOptions()
.dtype(dtype)
.layout(layout)
.device(device)
.pinned_memory(pin_memory);
if (options.has_device()) {
options = options.device(ensure_has_index(options.device()));
}
// memory_format is handled separately due to MemoryFormat::Preserve logic
options = self.options().merge_in(options).memory_format(c10::nullopt);
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() &&
(options.layout() == c10::kStrided));
if (memory_format == MemoryFormat::Preserve) {
if (self.is_non_overlapping_and_dense() && options.device().supports_as_strided()) {
Tensor r;
if (self.is_quantized()) {
r = at::empty_quantized(self.sizes(), self, options);
at::QuantizerPtr quantizer = r.quantizer();
r.copy_(self, non_blocking);
set_quantizer_(r, quantizer);
} else {
r = at::empty_strided(
self.sizes(),
self.strides(),
options.pinned_memory(pin_out));
r.copy_(self, non_blocking);
}
return r;
} else {
memory_format = self.suggest_memory_format();
}
}
// See Note [Explicit nullopt MemoryFormat argument]
auto r = at::empty(self.sizes(),
options.memory_format(memory_format).pinned_memory(pin_out),
c10::nullopt);
r.copy_(self, non_blocking);
return r;
}
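// Illustrative sketch of the Preserve path above, assuming a dense but
// non-contiguous CPU float tensor:
//   auto t = at::rand({2, 3}).transpose(0, 1);  // dense, non-contiguous
//   auto c = t.to(at::kDouble);                 // reaches _to_copy via to_impl
//   // c keeps t's sizes and strides, since memory_format defaults to
//   // MemoryFormat::Preserve and t.is_non_overlapping_and_dense() holds.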
template <typename T>
static inline bool is_null_or_equal_to(const c10::optional<T>& test, const T& value) {
if (!test.has_value()) {
return true;
}
return test.value() == value;
}
// NOTE: static runtime's to_maybe_copy_out relies on details of this
// check; if you change how it works, please update static runtime as
// well.
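//
// Returns true iff `to` with these arguments can simply return `self`
// unchanged instead of materializing a copy.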
bool to_will_alias(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
return is_null_or_equal_to(dtype, self.dtype().toScalarType()) &&
is_null_or_equal_to(layout, self.layout()) &&
is_null_or_equal_to(device, self.device()) &&
!copy &&
(memory_format == MemoryFormat::Preserve ||
self.suggest_memory_format() == memory_format);
}
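// Illustrative sketch (assuming `t` is a contiguous float CPU tensor):
//   to_will_alias(t, at::kFloat,  c10::nullopt, c10::nullopt, /*copy=*/false, c10::nullopt)  // true
//   to_will_alias(t, at::kDouble, c10::nullopt, c10::nullopt, /*copy=*/false, c10::nullopt)  // false: dtype changes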
static inline Tensor to_impl(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
// fast path
if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) {
return self;
}
return at::_to_copy(
self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}
// If the input tensor is fp32, cast it to the reduced-precision dtype for its
// device (cuda_dtype / cpu_dtype, typically fp16 or bf16); otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
if (self.dtype() == at::ScalarType::Float &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
at::ScalarType target = at::ScalarType::Undefined;
if (self.device().is_cuda()) {
target = cuda_dtype;
} else if (self.device().is_cpu()) {
target = cpu_dtype;
}
TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");
return to_impl(
self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
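// Illustrative sketch (assuming `x` is a float tensor on a CUDA device):
//   auto y = _autocast_to_reduced_precision(
//       x, /*cuda_enabled=*/true, /*cpu_enabled=*/false, at::kHalf, at::kBFloat16);
//   // y is fp16 on CUDA; with cpu_enabled=true a CPU float input would become bf16.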
// If the input tensor is fp16 or bf16, cast it back to fp32; otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
if ((self.dtype() == at::ScalarType::Half || self.dtype() == at::ScalarType::BFloat16) &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
return to_impl(
self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
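// The `to` overloads below all funnel into to_impl; they differ only in how the
// target is supplied: explicit optional dtype/layout/device, a (device, dtype)
// pair, a dtype alone, or another tensor whose options are copied.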
Tensor to(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format
) {
return to_impl(
self,
dtype,
layout,
ensure_has_index(device),
pin_memory,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
ensure_has_index(device),
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
nullopt,
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
auto options = other.options();
return to_impl(
self,
options.dtype().toScalarType(),
options.layout(),
options.device(),
options.pinned_memory(),
non_blocking,
copy,
optional_memory_format);
}
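// Roughly, these back the Python-level Tensor.to overloads (illustrative):
//   x.to(torch.float64)            -> the dtype-only overload
//   x.to("cuda:0", torch.float16)  -> the (device, dtype) overload
//   x.to(other)                    -> the tensor overload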
// This op is important primarily for lazy / graph-based backends.
// While this vanilla implementation loops through each tensor and independently
// converts it to CPU, a lazy backend like XLA may instead want to synchronize
// the conversions across all tensors at once.
std::vector<Tensor> _to_cpu(TensorList tensors) {
std::vector<Tensor> cpu_tensors;
for (const auto& t : tensors) {
cpu_tensors.push_back(t.cpu());
}
return cpu_tensors;
}
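// Backward of to_dense(): route the dense gradient back to the input's original
// layout, masking it onto the sparse pattern for sparse inputs and converting it
// back to MKLDNN for MKLDNN inputs.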
Tensor to_dense_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() != c10::kStrided);
if (input_.layout() == c10::kSparse) {
auto input = input_.coalesce();
return grad.sparse_mask(input);
} else if (input_.layout() == c10::kMkldnn) {
return grad.to_mkldnn(input_.scalar_type());
} else {
AT_ERROR("Unsupported input layout: ", input_.layout());
}
}
Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() == c10::kStrided);
return grad.to_dense(input_.scalar_type());
}
// Computes the strides for view_dtype output when the view dtype is
// smaller than the original dtype
inline DimVector compute_strides_for_view_dtype_downsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
new_strides[dim_idx] = old_strides[dim_idx] * size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
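// Worked example (downsize, assuming a float -> int16 view, size_ratio == 2):
//   a contiguous float tensor with sizes [4, 3] and strides [3, 1] is viewed as
//   int16 with sizes [4, 6] and strides [6, 1] (sizes are adjusted in view_dtype below).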
// Computes the strides for view_dtype output when the view dtype is
// larger than the original dtype
inline DimVector compute_strides_for_view_dtype_upsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
TORCH_CHECK(
(old_strides[dim_idx] % size_ratio) == 0,
"self.stride(", dim_idx, ") must be divisible by ", size_ratio,
" to view ", old_dtype, " as ", new_dtype, " (different element sizes), ",
"but got ", old_strides[dim_idx]);
new_strides[dim_idx] = old_strides[dim_idx] / size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
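// Worked example (upsize, assuming an int16 -> float view, size_ratio == 2):
//   an int16 tensor with sizes [4, 6] and strides [6, 1] is viewed as float with
//   sizes [4, 3] and strides [3, 1]; size(-1), the non-innermost strides, and the
//   storage offset must all be divisible by 2.
//
// Reinterpret self's storage as `dtype` without copying, adjusting sizes, strides,
// and storage offset when the old and new element sizes differ.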
Tensor view_dtype(const Tensor& self, ScalarType dtype) {
if (self.scalar_type() == dtype) {
return self;
}
const auto type_meta = c10::scalarTypeToTypeMeta(dtype);
TORCH_CHECK(!self.is_conj(),
"torch.Tensor.view is not supported for conjugate view tensors when converting to a different dtype.");
TORCH_CHECK(!self.is_neg(),
"torch.Tensor.view is not supported for tensors with negative bit set when converting to a different dtype.");
int64_t self_element_size = self.element_size();
int64_t new_element_size = static_cast<int64_t>(type_meta.itemsize());
Storage storage = self.storage();
auto new_tensor = detail::make_tensor<TensorImpl>(
std::move(storage), self.key_set(), type_meta);
auto* impl = new_tensor.unsafeGetTensorImpl();
if (self_element_size == new_element_size) {
impl->set_storage_offset(self.storage_offset());
impl->set_sizes_and_strides(self.sizes(), self.strides());
} else if (self.dim() == 0) {
TORCH_CHECK(false,
"self.dim() cannot be 0 to view ", self.scalar_type(), " as ",
dtype, " (different element sizes)");
} else if (self_element_size > new_element_size) {
// Downsizing element size
int64_t size_ratio = self_element_size / new_element_size;
auto new_strides = compute_strides_for_view_dtype_downsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] *= size_ratio;
auto new_storage_offset = size_ratio * self.storage_offset();
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
} else {
// Upsizing element size
int64_t size_ratio = new_element_size / self_element_size;
TORCH_CHECK(
(self.size(-1) % size_ratio) == 0,
"self.size(-1) must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), ",
"but got ", self.size(-1));
TORCH_CHECK(
(self.storage_offset() % size_ratio) == 0,
"self.storage_offset() must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), but got ",
self.storage_offset());
auto new_strides = compute_strides_for_view_dtype_upsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] /= size_ratio;
auto new_storage_offset = self.storage_offset() / size_ratio;
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
}
return new_tensor;
}
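// Illustrative usage, assuming the Tensor::view(ScalarType) method binding:
//   auto t = at::arange(6, at::kFloat).reshape({2, 3});
//   auto v = t.view(at::kInt);  // same storage reinterpreted; sizes stay {2, 3}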
}} // namespace at::native