Skip to content

Commit f3bafef

Browse files
authored
[ARM CPU] Add ACL deconvolution operation (openvinotoolkit#18655)
1 parent d51fc7a commit f3bafef

File tree

8 files changed

+657
-50
lines changed

8 files changed

+657
-50
lines changed

src/plugins/intel_cpu/src/nodes/deconv.cpp

+152-43
Large diffs are not rendered by default.

src/plugins/intel_cpu/src/nodes/deconv.h

+8-7
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
#include <vector>
1212
#include "common/dnnl_executor.h"
1313

14+
#include "executors/deconv_list.hpp"
15+
1416
namespace ov {
1517
namespace intel_cpu {
1618
namespace node {
@@ -20,6 +22,7 @@ class Deconvolution : public Node {
2022
Deconvolution(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);
2123

2224
void getSupportedDescriptors() override;
25+
void initSupportedPrimitiveDescriptors() override;
2326
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
2427
const std::vector<MemoryDescPtr>& outputDesc) override;
2528
void createPrimitive() override;
@@ -41,7 +44,7 @@ class Deconvolution : public Node {
4144
bool canFuse(const NodePtr& node) const override;
4245

4346
const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); }
44-
const std::vector<ptrdiff_t>& getStride() const { return stride; }
47+
const std::vector<ptrdiff_t>& getStride() const { return deconvAttrs.stride; }
4548

4649
void prepareParams() override;
4750
void execute(dnnl::stream strm) override;
@@ -55,6 +58,7 @@ class Deconvolution : public Node {
5558
AttrPtr initPrimitiveAttr() override;
5659
AttrPtr makePrimitiveAttr(const VectorDims& dims);
5760
std::vector<dnnl::memory::format_tag> getAvailableFormatsForDims(const Shape& dims) const override;
61+
std::shared_ptr<DeconvExecutor> execPtrDeconv = nullptr;
5862

5963
private:
6064
using executorPtr = std::shared_ptr<DnnlExecutor>;
@@ -89,16 +93,13 @@ class Deconvolution : public Node {
8993
size_t groupNum = 1;
9094
size_t IC = 0;
9195
size_t OC = 0;
92-
std::vector<ptrdiff_t> kernel;
93-
std::vector<ptrdiff_t> stride;
94-
std::vector<ptrdiff_t> dilation;
95-
ov::CoordinateDiff paddingL;
96-
ov::CoordinateDiff paddingR;
97-
ov::CoordinateDiff outputPadding;
9896
std::vector<int32_t> lastOutputSpatialDims;
9997
VectorDims int8WeightDims;
10098
VectorDims expectedBiasDims {};
10199

100+
bool useACL = false;
101+
DeconvAttrs deconvAttrs;
102+
102103
Shape inShape;
103104

104105
AttrPtr pAttr;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "acl_deconv.hpp"
6+
#include "ie_parallel.hpp"
7+
8+
namespace ov {
9+
namespace intel_cpu {
10+
11+
using namespace arm_compute;
12+
13+
// Build the ACL TensorInfo descriptors and PadStrideInfo for a deconvolution
// described by `deconvAttrs` and the given memory descriptors.
// srcDescs: [0] = input, [1] = weights, [2] = bias (read only when withBiasesParam).
// dstDescs: [0] = output.
ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs,
                                           const std::vector<MemoryDescPtr>& srcDescs,
                                           const std::vector<MemoryDescPtr>& dstDescs) {
    auto srcDims = srcDescs[0]->getShape().getDims();
    auto weiDims = srcDescs[1]->getShape().getDims();
    // Swap the input/output channel dimensions to be aligned with ACL:
    // ACL expects an [O, I, H, W] weights tensor while OV uses [I, O, H, W].
    std::swap(weiDims[0], weiDims[1]);
    auto dstDims = dstDescs[0]->getShape().getDims();

    VectorDims biasDims;
    TensorInfo biasTensorInfo;  // stays default-constructed when the op has no bias

    if (deconvAttrs.withBiasesParam) {
        biasDims = srcDescs[2]->getShape().getStaticDims();
        biasTensorInfo = TensorInfo(shapeCast(biasDims), 1,
            precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
    }

    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
        precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
    TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1,
        precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1]));
    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
        precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

    // paddingL/paddingR/stride are indexed here as {index 0 -> vertical (top/bottom,
    // stride_y), index 1 -> horizontal (left/right, stride_x)}; when only one value is
    // present (1D spatial case) it is reused for both axes.
    unsigned int pad_l =
        (deconvAttrs.paddingL.size() > 1) ? static_cast<unsigned int>(deconvAttrs.paddingL.at(1)) : static_cast<unsigned int>(deconvAttrs.paddingL.at(0));
    unsigned int pad_r =
        (deconvAttrs.paddingR.size() > 1) ? static_cast<unsigned int>(deconvAttrs.paddingR.at(1)) : static_cast<unsigned int>(deconvAttrs.paddingR.at(0));
    unsigned int pad_t = static_cast<unsigned int>(deconvAttrs.paddingL.at(0));
    unsigned int pad_b = static_cast<unsigned int>(deconvAttrs.paddingR.at(0));
    unsigned int stride_x = (deconvAttrs.stride.size() > 1) ? deconvAttrs.stride.at(1) : deconvAttrs.stride.at(0);
    unsigned int stride_y = deconvAttrs.stride.at(0);
    PadStrideInfo deconv_info(stride_x, stride_y, pad_l, pad_r, pad_t, pad_b, DimensionRoundingType::FLOOR);

    return ACLDeconvTensorInfo{srcTensorInfo, weiTensorInfo, biasTensorInfo, dstTensorInfo, deconv_info};
}
51+
52+
// Only forwards the context to the base class; all ACL state is set up in init().
AclDeconvExecutor::AclDeconvExecutor(const ExecutorContext::CPtr context) : DeconvExecutor(context) {}
53+
54+
bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
55+
const std::vector<MemoryDescPtr>& srcDescs,
56+
const std::vector<MemoryDescPtr>& dstDescs,
57+
const dnnl::primitive_attr &attr) {
58+
this->deconvAttrs = deconvAttrs;
59+
ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs);
60+
TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo;
61+
TensorInfo weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo;
62+
TensorInfo biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo;
63+
TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo;
64+
PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info;
65+
66+
arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
67+
&weiTensorInfo,
68+
deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr,
69+
&dstTensorInfo,
70+
deconv_info);
71+
if (!status) {
72+
DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
73+
return false;
74+
}
75+
76+
srcTensor.allocator()->init(srcTensorInfo);
77+
weiTensor.allocator()->init(weiTensorInfo);
78+
dstTensor.allocator()->init(dstTensorInfo);
79+
if (deconvAttrs.withBiasesParam)
80+
biasTensor.allocator()->init(biasTensorInfo);
81+
82+
deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
83+
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info);
84+
85+
// weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor
86+
weiBuffer = std::vector<float>(srcDescs[1]->getShape().getStaticDims()[0] *
87+
srcDescs[1]->getShape().getStaticDims()[1] *
88+
srcDescs[1]->getShape().getStaticDims()[2] *
89+
srcDescs[1]->getShape().getStaticDims()[3]);
90+
return true;
91+
}
92+
93+
static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
94+
const auto src_data = reinterpret_cast<float*>(srcMemPtr->getData());
95+
96+
const int DIM0 = srcMemPtr->getStaticDims()[0];
97+
const int DIM1 = srcMemPtr->getStaticDims()[1];
98+
const int DIM2 = srcMemPtr->getStaticDims()[2];
99+
const int DIM3 = srcMemPtr->getStaticDims()[3];
100+
101+
parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) {
102+
for (int dim3 = 0; dim3 < DIM3; ++dim3) {
103+
const int src_off = dim0 * DIM1 * DIM2 * DIM3 +
104+
dim1 * DIM2 * DIM3 +
105+
dim2 * DIM3 +
106+
dim3;
107+
const int dst_off = dim1 * DIM0 * DIM2 * DIM3 +
108+
dim0 * DIM2 * DIM3 +
109+
dim2 * DIM3 +
110+
dim3;
111+
112+
dst_data[dst_off] = src_data[src_off];
113+
}
114+
});
115+
}
116+
117+
void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
118+
// TODO: Remove transpose from exec
119+
transpose_to_1023(src[1], weiBuffer);
120+
121+
srcTensor.allocator()->import_memory(src[0]->getData());
122+
dstTensor.allocator()->import_memory(dst[0]->getData());
123+
weiTensor.allocator()->import_memory(weiBuffer.data());
124+
if (deconvAttrs.withBiasesParam)
125+
biasTensor.allocator()->import_memory(src[2]->getData());
126+
deconv->run();
127+
128+
srcTensor.allocator()->free();
129+
dstTensor.allocator()->free();
130+
weiTensor.allocator()->free();
131+
if (deconvAttrs.withBiasesParam)
132+
biasTensor.allocator()->free();
133+
}
134+
135+
// Static capability check: returns true only when ACL's NEDeconvolutionLayer can
// execute a deconvolution with the given attributes, ranks, precisions and layouts.
// Each rejection logs the reason via DEBUG_LOG before returning false.
bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs,
                                                 const std::vector<MemoryDescPtr> &srcDescs,
                                                 const std::vector<MemoryDescPtr> &dstDescs) {
    // Only 3D/4D inputs with a matching dst rank and 4D weights are supported.
    if ((srcDescs[0]->getShape().getDims().size() != 3 && srcDescs[0]->getShape().getDims().size() != 4) ||
        dstDescs[0]->getShape().getDims().size() != srcDescs[0]->getShape().getDims().size() ||
        srcDescs[1]->getShape().getDims().size() != 4) {
        DEBUG_LOG("AclDeconvExecutor does not support dimension:",
                  " src[0]=", srcDescs[0]->getShape().getDims().size(),
                  " src[1]=", srcDescs[1]->getShape().getDims().size(),
                  " dst[0]=", dstDescs[0]->getShape().getDims().size());
        return false;
    }

    // Input, weights and output must all be FP32.
    // TODO: Ticket CVS-114087 - enable FP16 once the FP16 scope is verified.
    if (!(one_of(srcDescs[0]->getPrecision(), /*InferenceEngine::Precision::FP16, */InferenceEngine::Precision::FP32) &&
          srcDescs[0]->getPrecision() == srcDescs[1]->getPrecision() &&
          srcDescs[1]->getPrecision() == dstDescs[0]->getPrecision())) {
        DEBUG_LOG("AclDeconvExecutor does not support precisions:",
                  " src[0]=", srcDescs[0]->getPrecision(),
                  " src[1]=", srcDescs[1]->getPrecision(),
                  " dst[0]=", dstDescs[0]->getPrecision());
        return false;
    }

    // Bias (when present) must match the input precision.
    if (deconvAttrs.withBiasesParam && srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) {
        DEBUG_LOG("AclDeconvExecutor does not support precisions:",
                  " src[2]=", srcDescs[2]->getPrecision());
        return false;
    }

    // Input, weights and output must share a layout: all ncsp (NCHW) or all nspc (NHWC).
    if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
          srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
          dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
        !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
          srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
          dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
        DEBUG_LOG("AclDeconvExecutor does not support layouts:",
                  " src[0]=", srcDescs[0]->serializeFormat(),
                  " src[1]=", srcDescs[1]->serializeFormat(),
                  " dst=", dstDescs[0]->serializeFormat());
        return false;
    }

    // Bias (when present) must itself be ncsp or nspc.
    if (deconvAttrs.withBiasesParam &&
        !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) &&
        !(srcDescs[2]->hasLayoutType(LayoutType::nspc))) {
        DEBUG_LOG("AclDeconvExecutor does not support layouts:",
                  " src[0]=", srcDescs[0]->serializeFormat(),
                  " src[1]=", srcDescs[1]->serializeFormat(),
                  " src[2]=", srcDescs[2]->serializeFormat(),
                  " dst=", dstDescs[0]->serializeFormat());
        return false;
    }

    ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs);
    TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo;
    TensorInfo weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo;
    TensorInfo biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo;
    TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo;
    PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info;

    unsigned int kernel_x = (deconvAttrs.kernel.size() > 1) ? deconvAttrs.kernel.at(1) : deconvAttrs.kernel.at(0);
    unsigned int kernel_y = deconvAttrs.kernel.at(0);

    // After stride=8 up-sampling in ACL Deconvolution layer slower than reference
    if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) return false;

    // ACL supports only trivial dilation (0 or 1 per axis).
    unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0);
    unsigned int dilation_y = deconvAttrs.dilation.at(0);
    if (!one_of(dilation_x, static_cast<unsigned int >(0), static_cast<unsigned int >(1)) ||
        !one_of(dilation_y, static_cast<unsigned int >(0), static_cast<unsigned int >(1))) return false;

    // Spatial dims live at different indices depending on layout: NCHW -> [2]/[3], NHWC -> [1]/[2].
    size_t in_h = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[2] : srcDescs[0]->getShape().getDims()[1];
    size_t in_w = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[3] : srcDescs[0]->getShape().getDims()[2];

    // Validate function has a bug (https://github.com/ARM-software/ComputeLibrary/issues/1061) with error exception.
    // We replicate arm_compute::deconvolution_output_dimensions checks here to get correct validation.
    // TODO: remove after fix
    if (validate_deconvolution_output_dimensions(in_w, in_h, kernel_x, kernel_y, deconv_info)) {
        DEBUG_LOG("NEDeconvolutionLayer arm_compute::deconvolution_output_dimensions failed");
        return false;
    }

    // Final authoritative check by ACL itself.
    arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
                                                                             &weiTensorInfo,
                                                                             deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr,
                                                                             &dstTensorInfo,
                                                                             deconv_info);
    if (!status) {
        DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
        return false;
    }

    return true;
}
230+
231+
bool AclDeconvExecutorBuilder::validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
232+
unsigned int kernel_width,
233+
unsigned int kernel_height,
234+
const PadStrideInfo &pad_stride_info) {
235+
const unsigned int pad_left = pad_stride_info.pad_left();
236+
const unsigned int pad_top = pad_stride_info.pad_top();
237+
const unsigned int pad_right = pad_stride_info.pad_right();
238+
const unsigned int pad_bottom = pad_stride_info.pad_bottom();
239+
const unsigned int stride_x = pad_stride_info.stride().first;
240+
const unsigned int stride_y = pad_stride_info.stride().second;
241+
242+
if (!((in_width < 1 || in_height < 1) ||
243+
(((in_width - 1) * stride_x + kernel_width) < (pad_left + pad_right)) ||
244+
(((in_height - 1) * stride_y + kernel_height) < (pad_top + pad_bottom)))) { return false; }
245+
return true;
246+
}
247+
} // namespace intel_cpu
248+
} // namespace ov
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "nodes/executors/deconv.hpp"
8+
#include "arm_compute/runtime/NEON/NEFunctions.h"
9+
#include "utils/debug_capabilities.h"
10+
#include "acl_utils.hpp"
11+
12+
namespace ov {
13+
namespace intel_cpu {
14+
15+
// Aggregates everything ACL needs to validate/configure one deconvolution:
// tensor metadata for src/weights/bias/dst plus the stride & padding setup.
struct ACLDeconvTensorInfo {
    arm_compute::TensorInfo srcTensorInfo;
    arm_compute::TensorInfo weiTensorInfo;   // channel dims already swapped to ACL's [O, I, H, W]
    arm_compute::TensorInfo biasTensorInfo;  // default-constructed when the op has no bias
    arm_compute::TensorInfo dstTensorInfo;
    arm_compute::PadStrideInfo deconv_info;
};
22+
23+
ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs,
24+
const std::vector<MemoryDescPtr>& srcDescs,
25+
const std::vector<MemoryDescPtr>& dstDescs);
26+
27+
// Deconvolution executor backed by Arm Compute Library's NEDeconvolutionLayer.
class AclDeconvExecutor : public DeconvExecutor {
public:
    explicit AclDeconvExecutor(const ExecutorContext::CPtr context);
    // Validates the configuration with ACL and, on success, configures the layer
    // and the weight-repack buffer. Returns false when ACL cannot execute it.
    bool init(const DeconvAttrs& deconvAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;
    // Binds src/dst memory to the ACL tensors and runs the configured layer.
    // src: [0] = input, [1] = weights (transposed internally), [2] = optional bias.
    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              const void *post_ops_data_) override;

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    DeconvAttrs deconvAttrs;
    impl_desc_type implType = impl_desc_type::acl;

    // ACL tensor handles; memory is imported per exec() call, never owned here.
    arm_compute::Tensor srcTensor;
    arm_compute::Tensor weiTensor;
    arm_compute::Tensor biasTensor;
    arm_compute::Tensor dstTensor;
    std::unique_ptr<arm_compute::NEDeconvolutionLayer> deconv = nullptr;

    // Holds weights transposed from OV's [I, O, H, W] to ACL's [O, I, H, W] order.
    std::vector<float> weiBuffer;
};
54+
55+
// Factory and capability checker for AclDeconvExecutor.
class AclDeconvExecutorBuilder : public DeconvExecutorBuilder {
public:
    // Static so callers can query support without instantiating a builder.
    static bool customIsSupported(const DeconvAttrs& deconvAttrs,
                                  const std::vector<MemoryDescPtr>& srcDescs,
                                  const std::vector<MemoryDescPtr>& dstDescs);

    bool isSupported(const DeconvAttrs& deconvAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        return customIsSupported(deconvAttrs, srcDescs, dstDescs);
    }

    DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclDeconvExecutor>(context);
    }

private:
    // Mirrors arm_compute::deconvolution_output_dimensions checks; returns true when
    // the configuration is invalid. Workaround for ComputeLibrary issue #1061 —
    // TODO: remove once the upstream fix lands.
    static bool validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
                                                         unsigned int kernel_width, unsigned int kernel_height,
                                                         const arm_compute::PadStrideInfo &pad_stride_info);
};
76+
77+
} // namespace intel_cpu
78+
} // namespace ov
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "deconv.hpp"
6+
7+
namespace ov {
8+
namespace intel_cpu {
9+
10+
using namespace InferenceEngine;
11+
12+
} // namespace intel_cpu
13+
} // namespace ov

0 commit comments

Comments
 (0)