Add quantized version of nms #3601
@@ -0,0 +1,125 @@
#include <ATen/ATen.h>
#include <ATen/native/quantized/affine_quantizer.h>
#include <torch/library.h>

namespace vision {
namespace ops {

namespace {

template <typename scalar_t>
at::Tensor qnms_kernel_impl(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
  TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
  TORCH_CHECK(
      dets.scalar_type() == scores.scalar_type(),
      "dets should have the same type as scores");

  if (dets.numel() == 0)
    return at::empty({0}, dets.options().dtype(at::kLong));

  const auto ndets = dets.size(0);

  auto x1_t = dets.select(1, 0).contiguous();
  auto y1_t = dets.select(1, 1).contiguous();
  auto x2_t = dets.select(1, 2).contiguous();
  auto y2_t = dets.select(1, 3).contiguous();
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
  at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();
  auto x1 = x1_t.data_ptr<scalar_t>();
  auto y1 = y1_t.data_ptr<scalar_t>();
  auto x2 = x2_t.data_ptr<scalar_t>();
  auto y2 = y2_t.data_ptr<scalar_t>();
  auto areas = areas_t.data_ptr<float>();

  auto areas_a = areas_t.accessor<float, 1>();
  for (int64_t i = 0; i < ndets; i++) {
    // Note: To get the exact area we'd need to multiply by scale**2, but this
    // would get canceled out in the computation of ovr below.
    // So we leave that out.
    areas_a[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_);
  }

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1)
      continue;
    keep[num_to_keep++] = i;

    auto ix1val = x1[i].val_;
    auto iy1val = y1[i].val_;
    auto ix2val = x2[i].val_;
    auto iy2val = y2[i].val_;
    auto iarea = areas[i];

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1)
        continue;
      auto xx1 = std::max(ix1val, x1[j].val_);
      auto yy1 = std::max(iy1val, y1[j].val_);
      auto xx2 = std::min(ix2val, x2[j].val_);
      auto yy2 = std::min(iy2val, y2[j].val_);

      auto w = std::max(0, xx2 - xx1); // * scale (gets canceled below)
      auto h = std::max(0, yy2 - yy1); // * scale (gets canceled below)
      auto inter = w * h;
      double ovr = inter / (iarea + areas[j] - inter);
Review comment (on the line above): don't you need to cast one of those terms to … Otherwise, if the exact value of …
      if (ovr > iou_threshold)
        suppressed[j] = 1;
    }
  }
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

at::Tensor qnms_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(
      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
  TORCH_CHECK(
      dets.size(1) == 4,
      "boxes should have 4 elements in dimension 1, got ",
      dets.size(1));
  TORCH_CHECK(
      scores.dim() == 1,
      "scores should be a 1d tensor, got ",
      scores.dim(),
      "D");
  TORCH_CHECK(
      dets.size(0) == scores.size(0),
      "boxes and scores should have same number of elements in ",
      "dimension 0, got ",
      dets.size(0),
      " and ",
      scores.size(0));

  auto result = at::empty({0});

  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
    result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}

} // namespace

TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
}

} // namespace ops
} // namespace vision
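The TORCH_LIBRARY_IMPL registration above means the existing `torchvision::nms` operator dispatches to `qnms_kernel` whenever it is called with quantized CPU tensors. As a rough, hypothetical sketch (not part of the PR), a C++ caller could exercise that path through the dispatcher along these lines; the box values and quantization parameters are made up:

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>

// Sketch only: assumes the torchvision ops library is linked/loaded so that
// the QuantizedCPU kernel above is registered with the dispatcher.
at::Tensor quantized_nms_example() {
  float box_data[] = {0.f, 0.f, 10.f, 10.f, 1.f, 1.f, 11.f, 11.f};
  float score_data[] = {0.9f, 0.8f};
  auto boxes = at::from_blob(box_data, {2, 4}).clone();
  auto scores = at::from_blob(score_data, {2}).clone();

  // Hypothetical quantization parameters.
  auto qboxes =
      at::quantize_per_tensor(boxes, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
  auto qscores =
      at::quantize_per_tensor(scores, /*scale=*/0.01, /*zero_point=*/0, at::kQUInt8);

  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("torchvision::nms", "")
                       .typed<at::Tensor(const at::Tensor&, const at::Tensor&, double)>();
  // qboxes/qscores carry the QuantizedCPU dispatch key, so qnms_kernel runs.
  return op.call(qboxes, qscores, /*iou_threshold=*/0.5);
}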
Review comment: In fact I think it's possible to get an underflow on unsigned types here, even when boxes aren't degenerate: if box j is strictly to the left of box i and they don't overlap, then we'll have xx2 < xx1.
Reply: yes, good catch, this happens more often than just with degenerate boxes here.
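As an illustration (not part of the PR thread), here is a minimal standalone sketch with made-up quint8 coordinates of the case being described: two disjoint boxes whose intersection corners come out with xx2 < xx1, exactly where an unsigned subtraction would have to go negative.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical boxes: box i = [0, 0, 5, 5], box j = [10, 0, 15, 5].
  // They are disjoint along x; quint8 stores its values as uint8_t.
  uint8_t ix1 = 0, ix2 = 5;
  uint8_t jx1 = 10, jx2 = 15;

  uint8_t xx1 = std::max(ix1, jx1); // 10: left edge of the "intersection"
  uint8_t xx2 = std::min(ix2, jx2); // 5:  right edge of the "intersection"

  // xx2 < xx1, so the intersection width is conceptually negative and a
  // plain unsigned subtraction xx2 - xx1 would have to wrap around.
  std::cout << int(xx1) << " " << int(xx2) << "\n"; // prints: 10 5
  return 0;
}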
Follow-up comment: Trying to come up with a reproducible example of an underflow was quite informative, and I came to the conclusion that 1) an underflow cannot actually happen with the current code, and 2) even if one is forced, the results are unchanged.

The reason for 1) is that the result of a - b is an int, even if both a and b are uint8_t: TIL about integral promotion (https://stackoverflow.com/questions/21421998/subtraction-operation-on-unsigned-char). Since quint8 is the only supported quantized unsigned type, such values can always be represented as ints, and integral promotion will always happen.

Regarding 2): it's possible to get an underflow by doing some explicit casts to uint8_t in the qnms code. With a suitable choice of boxes and scores, xx1, xx2, yy1, yy2 come out as 10, 5, 0, 5, which leads to an underflow when computing w = xx2 - xx1. However, the underflow leads to w = 251 and inter = some_big_value = 1255, and thus ovr ~= -2 (note the negative sign, because some_big_value gets subtracted in the denominator). Hence the if (ovr > iou_threshold) condition is never satisfied when there's an underflow, and that's actually correct, since the correct value for ovr should have been 0 anyway: underflows can only happen when the boxes are disjoint along at least one of the dimensions.

So the results are unchanged, and it's thus impossible to actually test against underflows... I think.
EDIT: we now explicitly cast coordinates to float32 to allow vectorization, so there's definitely no risk of underflow anymore when computing w or h.
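A compact way to check the points above: the sketch below (not from the PR; the areas are made up, so the exact ovr differs from the ~ -2 quoted in the comment) shows that integral promotion keeps the subtraction signed, that a forced uint8_t underflow yields a negative ovr that never passes the threshold test, and that casting the coordinates to float sidesteps the question entirely.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Intersection corners of two disjoint boxes (see the earlier sketch):
  // xx1 = 10, xx2 = 5, yy1 = 0, yy2 = 5, stored as uint8_t like quint8 data.
  uint8_t xx1 = 10, xx2 = 5, yy1 = 0, yy2 = 5;

  // 1) As written in the kernel, xx2 - xx1 is promoted to int, so the
  //    difference is -5 and std::max clamps the width to 0: no wraparound.
  int w_promoted = std::max(0, xx2 - xx1); // 0

  // 2) Forcing the result back into uint8_t reproduces the underflow from
  //    the comment: 5 - 10 wraps to 251, giving a large bogus intersection.
  uint8_t w_wrapped = static_cast<uint8_t>(xx2 - xx1); // 251
  int h = yy2 - yy1;                                   // 5
  int inter = int(w_wrapped) * h;                      // 1255
  float iarea = 25.f, jarea = 25.f;                    // made-up box areas
  double ovr = inter / (iarea + jarea - inter);        // about -1.04, negative
  // A negative ovr never exceeds a non-negative iou_threshold, so even the
  // forced underflow leaves the suppression decision unchanged (IoU is 0).

  // 3) Casting the coordinates to float before subtracting, as the updated
  //    kernel now does, avoids the unsigned subtraction altogether.
  float w_float = std::max(0.f, float(xx2) - float(xx1)); // 0.f

  std::cout << w_promoted << " " << int(w_wrapped) << " " << inter << " "
            << ovr << " " << w_float << "\n";
  return 0;
}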