#pragma once

#include <cstdint>
#include <functional>
#include <ostream>
#include <tuple>
#include <utility>

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>

/*
* Stream pool note.
*
* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
* are backed by cuStreams, but they use several pools to minimize the costs
* associated with creating, retaining, and destroying cuStreams.
*
* There are three pools per device, and a device's pools are lazily created.
*
* The first pool contains only the default stream. When the default stream
* is requested it's returned.
*
* The second pool is the "low priority" or "default priority" streams. In
* HIP builds there is no distinction between streams in this pool and streams
* in the third pool (below). There are 32 of these streams per device, and
* when a stream is requested one of these streams is returned round-robin.
* That is, the first stream requested is at index 0, the second at index 1,
* and so on up to index 31, after which requests wrap around to index 0
* again.
*
* This means that if 33 low priority streams are requested, the first and
* last streams requested are actually the same stream (under the covers)
* and kernels enqueued on them cannot run concurrently.
*
* The third pool is the "high priority" streams. The third pool acts like
* the second pool except the streams are created with a higher priority.
*
* These pools suggest that stream users should prefer many short-lived streams,
* as the cost of acquiring and releasing streams is effectively zero. If
* many longer-lived streams are required in performance critical scenarios
* then the functionality here may need to be extended to allow, for example,
* "reserving" a subset of the pool so that other streams do not accidentally
* overlap the performance critical streams.
*
* Note: although the notion of "current stream for device" is thread local
* (every OS thread has a separate current stream, as one might expect),
* the stream pool is global across all threads; stream 0 is always stream 0
* no matter which thread you use it on. Multiple threads can synchronize
* on the same stream. Although the CUDA documentation is not very clear
* on the matter, streams are thread-safe; e.g., it is safe to enqueue
* a kernel on the same stream from two different threads.
*/
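
// Illustrative usage sketch (not part of the upstream header): how the
// pools described above behave in practice. getStreamFromPool and
// getCurrentCUDAStream are declared later in this file; everything else
// in this sketch is hypothetical example code.
//
//   void pool_demo() {
//     // Cheap: streams are preallocated, so "creation" is just a
//     // round-robin pick from the 32-entry low priority pool.
//     c10::cuda::CUDAStream s0 = c10::cuda::getStreamFromPool();
//     c10::cuda::CUDAStream s1 = c10::cuda::getStreamFromPool();
//     // Request from the high priority (third) pool instead:
//     c10::cuda::CUDAStream hi = c10::cuda::getStreamFromPool(true);
//     // After 32 low priority requests the pool wraps around, so the
//     // 33rd request aliases the same underlying stream as s0.
//   }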
namespace c10 {
namespace cuda {
// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a CUDAStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a CUDA stream.
explicit CUDAStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
}
/// Construct a CUDAStream from a Stream with no error checking.
/// This constructor uses the "named" constructor idiom, and can
/// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const CUDAStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const CUDAStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to cudaStream_t.
operator cudaStream_t() const {
return stream();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is a
/// CUDA stream).
operator Stream() const {
return unwrap();
}
/// Get the CUDA device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device
/// is guaranteed to be a CUDA device.
Device device() const {
return Device(DeviceType::CUDA, device_index());
}
/// Return the stream ID corresponding to this particular stream.
StreamId id() const {
return stream_.id();
}
/// Return true (without blocking) if all work enqueued on this stream
/// has completed.
bool query() const {
DeviceGuard guard{stream_.device()};
cudaError_t err = cudaStreamQuery(stream());
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
/// Block the calling CPU thread until all work enqueued on this stream
/// has completed.
void synchronize() const {
DeviceGuard guard{stream_.device()};
c10::cuda::stream_synchronize(stream());
}
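
// Illustrative sketch (not in the upstream header): polling a stream with
// query() and falling back to a blocking wait. The kernel launch is a
// hypothetical placeholder.
//
//   CUDAStream s = getCurrentCUDAStream();
//   my_kernel<<<grid, block, 0, s>>>(args);  // hypothetical kernel
//   if (!s.query()) {     // false: work is still pending
//     s.synchronize();    // block until the stream drains
//   }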
/// Return the priority this stream was created with.
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
return priority;
}
/// Explicit conversion to cudaStream_t.
cudaStream_t stream() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack a CUDAStream into a uint64_t representation. This may
/// be helpful when storing a CUDAStream in a C struct, where you cannot
/// conveniently place the CUDAStream object itself (which is morally
/// equivalent, but unfortunately is not POD because it has constructors).
///
/// The CUDAStream can be unpacked using unpack(). The format of
/// the uint64_t is unspecified and may be changed.
uint64_t pack() const noexcept {
return stream_.pack();
}
/// Unpack a CUDAStream from the uint64_t representation generated by
/// pack().
static CUDAStream unpack(uint64_t bits) {
return CUDAStream(Stream::unpack(bits));
}
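
// Illustrative sketch (not in the upstream header): round-tripping a
// stream through a POD struct via pack()/unpack(). WorkItem is a
// hypothetical example type.
//
//   struct WorkItem {
//     uint64_t stream_bits;  // POD-friendly stand-in for a CUDAStream
//   };
//   WorkItem w{getCurrentCUDAStream().pack()};
//   CUDAStream s = CUDAStream::unpack(w.stream_bits);
//   // s compares equal to the stream that was packed.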
static std::tuple<int, int> priority_range() {
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter. Currently PyTorch only supports 0 and -1, which are "low" and
// "high" priority.
int least_priority, greatest_priority;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
TORCH_INTERNAL_ASSERT(
least_priority >= 0, "Unexpected CUDA stream priority range");
TORCH_INTERNAL_ASSERT(
greatest_priority <= -1, "Unexpected CUDA stream priority range");
return std::make_tuple(0, -1);
}
// Deleted for now; use CUDAEvent::block instead
// void synchronize_with(const CUDAEvent& event) const;
private:
Stream stream_;
};
/**
* Get a new stream from the CUDA stream pool. You can think of this
* as "creating" a new stream, but no such creation actually happens;
* instead, streams are preallocated from the pool and returned in a
* round-robin fashion.
*
* You can request a stream from the high priority pool by setting
* isHighPriority to true, or a stream for a specific device by setting
* device (defaulting to the current device).
*/
C10_CUDA_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/**
* Get a CUDAStream from an externally allocated one.
*
* This is mainly for interoperability with different libraries where we
* want to operate on a non-torch-allocated stream for data exchange or
* similar purposes.
*/
C10_CUDA_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
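
// Illustrative sketch (not part of the upstream header): wrapping a raw
// stream created outside PyTorch so work can be enqueued on it through
// the usual c10 APIs. Device index 0 is an assumption for the example.
//
//   cudaStream_t raw = nullptr;
//   C10_CUDA_CHECK(cudaStreamCreate(&raw));
//   c10::cuda::CUDAStream ext =
//       c10::cuda::getStreamFromExternal(raw, /*device_index=*/0);
//   c10::cuda::setCurrentCUDAStream(ext);  // subsequent work targets raw
//   // The caller keeps ownership and must eventually destroy raw.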
/**
* Get the default CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The default stream is
* where most computation occurs when you aren't explicitly using
* streams.
*/
C10_CUDA_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
/**
* Get the current CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The current CUDA stream
* will usually be the default CUDA stream for the device, but it may
* be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
* or 'CUDAStreamGuard'.
*/
C10_CUDA_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
/**
* Set the current stream on the device of the passed-in stream to be
* the passed-in stream. Yes, you read that right: this function
* has *nothing* to do with the current device: it changes the current
* stream of the device of the passed stream.
*
* Confused? Avoid using this function; prefer using 'CUDAStreamGuard'
* instead (which will switch both your current device and current stream
* in the way you expect, and reset it back to its original state
* afterwards).
*/
C10_CUDA_API void setCurrentCUDAStream(CUDAStream stream);
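
// Illustrative sketch (not part of the upstream header): the guard-based
// alternative recommended above. CUDAStreamGuard is declared in
// <c10/cuda/CUDAGuard.h>, not in this file.
//
//   {
//     c10::cuda::CUDAStream s = c10::cuda::getStreamFromPool();
//     c10::cuda::CUDAStreamGuard guard(s);  // switches device AND stream
//     // ... enqueue work on s ...
//   }  // device and stream restored when the guard goes out of scope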
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
} // namespace cuda
} // namespace c10
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
size_t operator()(c10::cuda::CUDAStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std