Add quantized version of nms #3601

Merged · 21 commits · Mar 30, 2021

7 changes: 5 additions & 2 deletions setup.py
@@ -138,8 +138,11 @@ def get_extensions():

main_file = glob.glob(os.path.join(extensions_dir, '*.cpp')) + glob.glob(os.path.join(extensions_dir, 'ops',
'*.cpp'))
- source_cpu = glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) + glob.glob(
-     os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+ source_cpu = (
+     glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
+     glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
+     glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
+ )

is_rocm_pytorch = False
if torch.__version__ >= '1.5':
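For orientation, a minimal sketch of what the setup.py change does, assuming extensions_dir resolves to torchvision/csrc as in the surrounding get_extensions() code: the extra glob simply adds every .cpp file under ops/quantized/cpu (including the new qnms_kernel.cpp) to the list of CPU sources compiled into the extension.

import glob
import os

# Assumed to mirror the value computed earlier in setup.py's get_extensions().
extensions_dir = os.path.join('torchvision', 'csrc')

source_cpu = (
    glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
    glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
    # New in this PR: pick up the quantized CPU kernels as well.
    glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
)

# Expect the new kernel, e.g. torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp,
# to appear among the CPU sources.
print([f for f in source_cpu if 'quantized' in f])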
23 changes: 23 additions & 0 deletions test/test_ops.py
@@ -418,6 +418,29 @@ def test_nms(self):
        self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(3, 2), 0.5)
        self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(4), 0.5)

    def test_qnms(self):
        # Note: we compare qnms vs nms instead of qnms vs a reference implementation.
        # This is because with the int conversion, the trick used in _create_tensors_with_iou
        # doesn't really work (in fact, nms vs the reference implementation would also fail with ints)
        err_msg = 'NMS and QNMS give different results for IoU={}'
        for iou in [0.2, 0.5, 0.8]:
            for scale, zero_point in ((1, 0), (2, 50), (3, 10)):
                boxes, scores = self._create_tensors_with_iou(1000, iou)
                scores *= 100  # otherwise most scores would be 0 or 1 after int conversion

                qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point,
                                                   dtype=torch.quint8)
                qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point,
                                                    dtype=torch.quint8)

                boxes = qboxes.dequantize()
                scores = qscores.dequantize()

                keep = ops.nms(boxes, scores, iou)
                qkeep = ops.nms(qboxes, qscores, iou)

                self.assertTrue(torch.allclose(qkeep, keep), err_msg.format(iou))

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_nms_cuda(self, dtype=torch.float64):
        tol = 1e-3 if dtype is torch.half else 1e-5
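As a usage illustration of what the new test_qnms test above exercises, here is a minimal sketch of calling torchvision.ops.nms directly on quantized tensors; the box coordinates, scores, and quantization parameters below are made up for the example, and the expected output is indicative only.

import torch
import torchvision.ops as ops

# Hypothetical data: boxes 0 and 1 overlap heavily, box 2 is disjoint.
boxes = torch.tensor([[0., 0., 100., 100.],
                      [5., 5., 105., 105.],
                      [200., 200., 250., 250.]])
scores = torch.tensor([90., 80., 70.])

# Quantize both inputs; with this PR, nms dispatches to the QuantizedCPU kernel.
qboxes = torch.quantize_per_tensor(boxes, scale=2.0, zero_point=0, dtype=torch.quint8)
qscores = torch.quantize_per_tensor(scores, scale=2.0, zero_point=0, dtype=torch.quint8)

keep = ops.nms(qboxes, qscores, iou_threshold=0.5)
print(keep)  # expected to keep boxes 0 and 2, i.e. tensor([0, 2])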
129 changes: 129 additions & 0 deletions torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
@@ -0,0 +1,129 @@
#include <ATen/ATen.h>
#include <ATen/native/quantized/affine_quantizer.h>
#include <torch/library.h>

namespace vision {
namespace ops {

namespace {

template <typename scalar_t>
at::Tensor qnms_kernel_impl(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold) {
TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
TORCH_CHECK(
dets.scalar_type() == scores.scalar_type(),
"dets should have the same type as scores");

if (dets.numel() == 0)
return at::empty({0}, dets.options().dtype(at::kLong));

const auto ndets = dets.size(0);

auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));

auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<scalar_t>();
auto y1 = y1_t.data_ptr<scalar_t>();
auto x2 = x2_t.data_ptr<scalar_t>();
auto y2 = y2_t.data_ptr<scalar_t>();
auto areas = areas_t.data_ptr<float>();

for (int64_t i = 0; i < ndets; i++) {
// Note 1: To get the exact area we'd need to multiply by scale**2, but this
// would get canceled out in the computation of ovr below. So we leave that
// out.
// Note 2: degenerate boxes (x2 < x1 or y2 < y1) may underflow, although
// integral promotion rules will likely prevent it (see
// https://stackoverflow.com/questions/32959564/subtraction-of-two-unsigned-gives-signed
// for more details).
areas[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_);
}

int64_t num_to_keep = 0;

for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1)
continue;
keep[num_to_keep++] = i;

// We explicitly cast coordinates to float so that the code can be
// vectorized.
float ix1val = x1[i].val_;
float iy1val = y1[i].val_;
float ix2val = x2[i].val_;
float iy2val = y2[i].val_;
float iarea = areas[i];

for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1)
continue;
float xx1 = std::max(ix1val, (float)x1[j].val_);
float yy1 = std::max(iy1val, (float)y1[j].val_);
float xx2 = std::min(ix2val, (float)x2[j].val_);
float yy2 = std::min(iy2val, (float)y2[j].val_);

auto w = std::max(0.f, xx2 - xx1); // * scale (gets canceled below)
auto h = std::max(0.f, yy2 - yy1); // * scale (gets canceled below)
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold)
suppressed[j] = 1;
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

at::Tensor qnms_kernel(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold) {
TORCH_CHECK(
dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
TORCH_CHECK(
dets.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
dets.size(1));
TORCH_CHECK(
scores.dim() == 1,
"scores should be a 1d tensor, got ",
scores.dim(),
"D");
TORCH_CHECK(
dets.size(0) == scores.size(0),
"boxes and scores should have same number of elements in ",
"dimension 0, got ",
dets.size(0),
" and ",
scores.size(0));

auto result = at::empty({0});

AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
});
return result;
}

} // namespace

TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
}

} // namespace ops
} // namespace vision
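To make the cancellation argument in Note 1 of qnms_kernel_impl concrete: IoU is a ratio, so the per-tensor scale factors out of the intersection and the areas, and the zero_point drops out of every coordinate difference. A small sketch (with made-up quantization parameters and raw integer coordinates) showing that IoU computed on the raw quint8 values matches IoU on the dequantized boxes:

# Hypothetical per-tensor quantization parameters.
scale, zero_point = 2.0, 10

def iou(b1, b2):
    # Plain IoU of two [x1, y1, x2, y2] boxes.
    xx1, yy1 = max(b1[0], b2[0]), max(b1[1], b2[1])
    xx2, yy2 = min(b1[2], b2[2]), min(b1[3], b2[3])
    w, h = max(0.0, xx2 - xx1), max(0.0, yy2 - yy1)
    inter = w * h
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    return inter / (area1 + area2 - inter)

# Raw integer coordinates, i.e. what the kernel reads from the .val_ fields.
q1 = [10, 10, 60, 60]
q2 = [20, 20, 70, 70]

# Dequantized coordinates: scale * (q - zero_point).
d1 = [scale * (v - zero_point) for v in q1]
d2 = [scale * (v - zero_point) for v in q2]

print(iou(q1, q2), iou(d1, d2))  # both ~0.4706: scale and zero_point cancel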