6
6
7
7
using namespace at ;
8
8
9
// Rounding modes used by the quantization kernels:
//   rNearest    - deterministic round-to-nearest (adds half an ULP before truncating)
//   rStochastic - stochastic rounding driven by the module-level PRNG
enum Mode
{
  rNearest,
  rStochastic
};
10
14
11
15
// Input-validation helpers: fail fast when a tensor is non-contiguous or
// lives on the GPU (these kernels index raw float buffers directly).
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CPU(x) AT_CHECK(!x.type().is_cuda(), #x " must be a CPU tensor")
// Wrapped in do { } while (0) so the macro expands to a single statement
// and stays safe inside an un-braced if/else (multi-statement macro fix).
#define CHECK_INPUT(x)   \
  do                     \
  {                      \
    CHECK_CPU(x);        \
    CHECK_CONTIGUOUS(x); \
  } while (0)

// Type-punning helpers between a float and its 32-bit pattern.
// NOTE(review): RFLOAT_TO_BITS / RBITS_TO_FLOAT dereference through
// reinterpret_cast, which violates strict aliasing; the memcpy-based
// FLOAT_TO_BITS / BITS_TO_FLOAT below are the well-defined forms and
// should be preferred. Kept for source compatibility with existing callers.
#define RFLOAT_TO_BITS(x) (*reinterpret_cast<unsigned int *>(x))
#define RBITS_TO_FLOAT(x) (*reinterpret_cast<float *>(x))
// Copy the bit pattern of float f into integer i (sizes must match).
#define FLOAT_TO_BITS(f, i)       \
  do                              \
  {                               \
    assert(sizeof f == sizeof i); \
    std::memcpy(&i, &f, sizeof i);\
  } while (0)
// Copy the bit pattern of integer i into float f (sizes must match).
#define BITS_TO_FLOAT(i, f)       \
  do                              \
  {                               \
    assert(sizeof f == sizeof i); \
    std::memcpy(&f, &i, sizeof f);\
  } while (0)
19
29
20
30
// Shared PRNG state for stochastic rounding: a Mersenne Twister seeded once
// from the system entropy source, and a uniform distribution over
// non-negative ints (upper bound defaults to INT_MAX).
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0);
23
33
24
34
// Saturate `a` into [min, max]. Values already in range pass through
// unchanged; for floating-point T a NaN fails both comparisons and is
// returned as-is, matching the original branch ordering.
template <typename T>
T clamp_helper(T a, T min, T max)
{
  return (a > max) ? max : ((a < min) ? min : a);
}
30
44
31
45
// Saturate `a` into [min, max] and record whether clamping happened:
// *mask is set to 1 on saturation and left untouched otherwise, so callers
// can pre-zero a mask buffer and accumulate per-element overflow flags.
template <typename T>
T clamp_mask_helper(T a, T min, T max, uint8_t *mask)
{
  if (a > max)
  {
    *mask = 1;
    return max;
  }
  if (a < min)
  {
    *mask = 1;
    return min;
  }
  return a;
}
43
61
44
- std::tuple<Tensor, Tensor> fixed_point_quantize_stochastic_mask (Tensor a, int wl, int fl, bool symmetric) {
62
+ std::tuple<Tensor, Tensor> fixed_point_quantize_stochastic_mask (Tensor a, int wl, int fl, bool symmetric)
63
+ {
45
64
CHECK_INPUT (a);
46
65
auto r = rand_like (a);
47
66
auto a_array = a.data <float >();
@@ -54,14 +73,16 @@ std::tuple<Tensor, Tensor> fixed_point_quantize_stochastic_mask(Tensor a, int wl
54
73
int sigma = -fl;
55
74
float t_min, t_max;
56
75
fixed_min_max (wl, fl, symmetric, &t_min, &t_max);
57
- for (int64_t i=0 ; i < size; i++) {
76
+ for (int64_t i = 0 ; i < size; i++)
77
+ {
58
78
o_array[i] = round (a_array[i], r_array[i], sigma);
59
- o_array[i] = clamp_mask_helper<float >(o_array[i], t_min, t_max, m_array+ i);
79
+ o_array[i] = clamp_mask_helper<float >(o_array[i], t_min, t_max, m_array + i);
60
80
}
61
81
return std::make_tuple (o, m);
62
82
}
63
83
64
- std::tuple<Tensor, Tensor> fixed_point_quantize_nearest_mask (Tensor a, int wl, int fl, bool symmetric) {
84
+ std::tuple<Tensor, Tensor> fixed_point_quantize_nearest_mask (Tensor a, int wl, int fl, bool symmetric)
85
+ {
65
86
CHECK_INPUT (a);
66
87
auto a_array = a.data <float >();
67
88
auto o = zeros_like (a);
@@ -72,14 +93,16 @@ std::tuple<Tensor, Tensor> fixed_point_quantize_nearest_mask(Tensor a, int wl, i
72
93
int sigma = -fl;
73
94
float t_min, t_max;
74
95
fixed_min_max (wl, fl, symmetric, &t_min, &t_max);
75
- for (int64_t i=0 ; i < size; i++) {
96
+ for (int64_t i = 0 ; i < size; i++)
97
+ {
76
98
o_array[i] = round (a_array[i], 0.5 , sigma);
77
- o_array[i] = clamp_mask_helper<float >(o_array[i], t_min, t_max, m_array+ i);
99
+ o_array[i] = clamp_mask_helper<float >(o_array[i], t_min, t_max, m_array + i);
78
100
}
79
101
return std::make_tuple (o, m);
80
102
}
81
103
82
- Tensor fixed_point_quantize_stochastic (Tensor a, int wl, int fl, bool clamp, bool symmetric) {
104
+ Tensor fixed_point_quantize_stochastic (Tensor a, int wl, int fl, bool clamp, bool symmetric)
105
+ {
83
106
CHECK_INPUT (a);
84
107
auto r = rand_like (a);
85
108
auto a_array = a.data <float >();
@@ -90,16 +113,19 @@ Tensor fixed_point_quantize_stochastic(Tensor a, int wl, int fl, bool clamp, boo
90
113
int sigma = -fl;
91
114
float t_min, t_max;
92
115
fixed_min_max (wl, fl, symmetric, &t_min, &t_max);
93
- for (int64_t i=0 ; i < size; i++) {
116
+ for (int64_t i = 0 ; i < size; i++)
117
+ {
94
118
o_array[i] = round (a_array[i], r_array[i], sigma);
95
- if (clamp) {
119
+ if (clamp)
120
+ {
96
121
o_array[i] = clamp_helper (o_array[i], t_min, t_max);
97
122
}
98
123
}
99
124
return o;
100
125
}
101
126
102
- Tensor fixed_point_quantize_nearest (Tensor a, int wl, int fl, bool clamp, bool symmetric) {
127
+ Tensor fixed_point_quantize_nearest (Tensor a, int wl, int fl, bool clamp, bool symmetric)
128
+ {
103
129
CHECK_INPUT (a);
104
130
auto a_array = a.data <float >();
105
131
Tensor o = zeros_like (a);
@@ -108,31 +134,39 @@ Tensor fixed_point_quantize_nearest(Tensor a, int wl, int fl, bool clamp, bool s
108
134
int sigma = -fl;
109
135
float t_min, t_max;
110
136
fixed_min_max (wl, fl, symmetric, &t_min, &t_max);
111
- for (int64_t i=0 ; i < size; i++) {
137
+ for (int64_t i = 0 ; i < size; i++)
138
+ {
112
139
o_array[i] = round (a_array[i], 0.5 , sigma);
113
- if (clamp) {
140
+ if (clamp)
141
+ {
114
142
o_array[i] = clamp_helper (o_array[i], t_min, t_max);
115
143
}
116
144
}
117
145
return o;
118
146
}
119
147
120
- unsigned int round_bitwise (unsigned int target, int man_bits, Mode rounding){
121
- unsigned int mask = (1 << (23 -man_bits)) - 1 ;
148
+ unsigned int round_bitwise (unsigned int target, int man_bits, Mode rounding)
149
+ {
150
+ unsigned int mask = (1 << (23 - man_bits)) - 1 ;
122
151
unsigned int rand_prob;
123
- if (rounding == rStochastic) {
152
+ if (rounding == rStochastic)
153
+ {
124
154
rand_prob = (dis (gen)) & mask;
125
- } else {
126
- rand_prob = 1 << (23 -man_bits-1 );
127
155
}
128
- unsigned int add_r = target+rand_prob;
156
+ else
157
+ {
158
+ rand_prob = 1 << (23 - man_bits - 1 );
159
+ }
160
+ unsigned int add_r = target + rand_prob;
129
161
unsigned int quantized = add_r & ~mask;
130
162
return quantized;
131
163
}
132
164
133
- void block_quantize_helper (float * input, float * output, float * max_elem,
134
- int wl, int size, Mode rounding) {
135
- for (int64_t i=0 ; i < size; i++) {
165
+ void block_quantize_helper (float *input, float *output, float *max_elem,
166
+ int wl, int size, Mode rounding)
167
+ {
168
+ for (int64_t i = 0 ; i < size; i++)
169
+ {
136
170
137
171
unsigned int max_num;
138
172
FLOAT_TO_BITS (max_elem[i], max_num);
@@ -141,31 +175,37 @@ void block_quantize_helper(float* input, float* output, float* max_elem,
141
175
BITS_TO_FLOAT (max_exp, base_float);
142
176
base_float *= 6 ;
143
177
144
- float target_rebase = input[i]+ base_float;
178
+ float target_rebase = input[i] + base_float;
145
179
unsigned int target_bits;
146
180
FLOAT_TO_BITS (target_rebase, target_bits);
147
181
unsigned int quantized_bits = round_bitwise (target_bits, wl, rounding); // -1 sign, -1 virtual, +2 base
148
182
float quantized_rebase;
149
183
BITS_TO_FLOAT (quantized_bits, quantized_rebase);
150
- float quantized = quantized_rebase- base_float;
184
+ float quantized = quantized_rebase - base_float;
151
185
152
186
unsigned int quantize_bits;
153
187
FLOAT_TO_BITS (quantized, quantize_bits);
154
- unsigned int clip_quantize = clip_max_exponent (wl- 2 , max_exp, quantize_bits);
188
+ unsigned int clip_quantize = clip_max_exponent (wl - 2 , max_exp, quantize_bits);
155
189
BITS_TO_FLOAT (clip_quantize, quantized);
156
190
157
191
output[i] = quantized;
158
192
}
159
193
}
160
194
161
- Tensor get_max_entry (Tensor a, int dim) {
195
+ Tensor get_max_entry (Tensor a, int dim)
196
+ {
162
197
Tensor max_entry;
163
- if (dim == -1 ) {
198
+ if (dim == -1 )
199
+ {
164
200
max_entry = at::max (at::abs (a)).expand_as (a).contiguous ();
165
- } else if (dim == 0 ) {
201
+ }
202
+ else if (dim == 0 )
203
+ {
166
204
Tensor input_view = a.view ({a.size (0 ), -1 });
167
205
max_entry = std::get<0 >(input_view.max (1 , true )).abs ().expand_as (input_view).view_as (a).contiguous ();
168
- } else {
206
+ }
207
+ else
208
+ {
169
209
Tensor input_transpose = a.transpose (0 , dim);
170
210
Tensor input_view = input_transpose.contiguous ().view ({input_transpose.size (0 ), -1 });
171
211
Tensor max_transpose = std::get<0 >(input_view.max (1 , true )).abs ().expand_as (input_view).view_as (input_transpose);
@@ -174,7 +214,8 @@ Tensor get_max_entry(Tensor a, int dim) {
174
214
return max_entry;
175
215
}
176
216
177
- Tensor block_quantize_nearest (Tensor a, int wl, int dim) {
217
+ Tensor block_quantize_nearest (Tensor a, int wl, int dim)
218
+ {
178
219
CHECK_INPUT (a);
179
220
auto a_array = a.data <float >();
180
221
Tensor o = zeros_like (a);
@@ -188,7 +229,8 @@ Tensor block_quantize_nearest(Tensor a, int wl, int dim) {
188
229
return o;
189
230
}
190
231
191
- Tensor block_quantize_stochastic (Tensor a, int wl, int dim) {
232
+ Tensor block_quantize_stochastic (Tensor a, int wl, int dim)
233
+ {
192
234
CHECK_INPUT (a);
193
235
auto a_array = a.data <float >();
194
236
Tensor o = zeros_like (a);
@@ -203,15 +245,16 @@ Tensor block_quantize_stochastic(Tensor a, int wl, int dim) {
203
245
return o;
204
246
}
205
247
206
-
207
- Tensor float_quantize_stochastic (Tensor a, int man_bits, int exp_bits) {
248
+ Tensor float_quantize_stochastic (Tensor a, int man_bits, int exp_bits)
249
+ {
208
250
// use external random number right now
209
251
auto a_array = a.data <float >();
210
252
auto o = zeros_like (a);
211
253
auto o_array = o.data <float >();
212
254
int size = a.numel ();
213
255
214
- for (int64_t i=0 ; i < size; i++) {
256
+ for (int64_t i = 0 ; i < size; i++)
257
+ {
215
258
unsigned int target;
216
259
FLOAT_TO_BITS (a_array[i], target);
217
260
unsigned int quantize_bits = round_bitwise (target, man_bits, rStochastic);
@@ -223,13 +266,15 @@ Tensor float_quantize_stochastic(Tensor a, int man_bits, int exp_bits) {
223
266
return o;
224
267
}
225
268
226
- Tensor float_quantize_nearest (Tensor a, int man_bits, int exp_bits) {
269
+ Tensor float_quantize_nearest (Tensor a, int man_bits, int exp_bits)
270
+ {
227
271
auto a_array = a.data <float >();
228
272
auto o = zeros_like (a);
229
273
auto o_array = o.data <float >();
230
274
int size = a.numel ();
231
275
232
- for (int64_t i=0 ; i < size; i++) {
276
+ for (int64_t i = 0 ; i < size; i++)
277
+ {
233
278
unsigned int target;
234
279
FLOAT_TO_BITS (a_array[i], target);
235
280
unsigned int quantize_bits = round_bitwise (target, man_bits, rNearest);
@@ -241,7 +286,8 @@ Tensor float_quantize_nearest(Tensor a, int man_bits, int exp_bits) {
241
286
return o;
242
287
}
243
288
244
- PYBIND11_MODULE (TORCH_EXTENSION_NAME, m) {
289
+ PYBIND11_MODULE (TORCH_EXTENSION_NAME, m)
290
+ {
245
291
m.def (" fixed_point_quantize_stochastic_mask" , &fixed_point_quantize_stochastic_mask, " Fixed Point Number Stochastic Quantization with Mask (CPU)" );
246
292
m.def (" fixed_point_quantize_stochastic" , &fixed_point_quantize_stochastic, " Fixed Point Number Stochastic Quantization (CPU)" );
247
293
m.def (" block_quantize_stochastic" , &block_quantize_stochastic, " Block Floating Point Number Stochastic Quantization (CPU)" );
0 commit comments