Skip to content

Commit

Permalink
[ARM CPU] Add ACL deconvolution operation (openvinotoolkit#18655)
Browse files Browse the repository at this point in the history
  • Loading branch information
allnes authored Aug 2, 2023
1 parent d51fc7a commit f3bafef
Show file tree
Hide file tree
Showing 8 changed files with 657 additions and 50 deletions.
195 changes: 152 additions & 43 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions src/plugins/intel_cpu/src/nodes/deconv.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <vector>
#include "common/dnnl_executor.h"

#include "executors/deconv_list.hpp"

namespace ov {
namespace intel_cpu {
namespace node {
Expand All @@ -20,6 +22,7 @@ class Deconvolution : public Node {
Deconvolution(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);

void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) override;
void createPrimitive() override;
Expand All @@ -41,7 +44,7 @@ class Deconvolution : public Node {
bool canFuse(const NodePtr& node) const override;

const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); }
const std::vector<ptrdiff_t>& getStride() const { return stride; }
const std::vector<ptrdiff_t>& getStride() const { return deconvAttrs.stride; }

void prepareParams() override;
void execute(dnnl::stream strm) override;
Expand All @@ -55,6 +58,7 @@ class Deconvolution : public Node {
AttrPtr initPrimitiveAttr() override;
AttrPtr makePrimitiveAttr(const VectorDims& dims);
std::vector<dnnl::memory::format_tag> getAvailableFormatsForDims(const Shape& dims) const override;
std::shared_ptr<DeconvExecutor> execPtrDeconv = nullptr;

private:
using executorPtr = std::shared_ptr<DnnlExecutor>;
Expand Down Expand Up @@ -89,16 +93,13 @@ class Deconvolution : public Node {
size_t groupNum = 1;
size_t IC = 0;
size_t OC = 0;
std::vector<ptrdiff_t> kernel;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
ov::CoordinateDiff paddingL;
ov::CoordinateDiff paddingR;
ov::CoordinateDiff outputPadding;
std::vector<int32_t> lastOutputSpatialDims;
VectorDims int8WeightDims;
VectorDims expectedBiasDims {};

bool useACL = false;
DeconvAttrs deconvAttrs;

Shape inShape;

AttrPtr pAttr;
Expand Down
248 changes: 248 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_deconv.hpp"
#include "ie_parallel.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

/**
 * @brief Translates deconvolution attributes and memory descriptors into ACL descriptors.
 *
 * Produces arm_compute::TensorInfo objects for the data, weights, bias and destination tensors
 * together with the arm_compute::PadStrideInfo describing strides and paddings.
 *
 * @param deconvAttrs deconvolution attributes; paddingL, paddingR, stride and withBiasesParam are read
 * @param srcDescs    input descriptors: [0] data, [1] weights, [2] bias (read only when
 *                    deconvAttrs.withBiasesParam is set)
 * @param dstDescs    output descriptors: [0] result
 * @return aggregate holding the four tensor infos and the pad/stride info; the bias info is left
 *         default-constructed when no bias is used
 */
ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs,
                                           const std::vector<MemoryDescPtr>& srcDescs,
                                           const std::vector<MemoryDescPtr>& dstDescs) {
    // Single-channel TensorInfo built from a shape plus the precision/layout of its descriptor.
    const auto makeTensorInfo = [](const VectorDims& dims, const MemoryDescPtr& desc) {
        return TensorInfo(shapeCast(dims), 1,
                          precisionToAclDataType(desc->getPrecision()), getAclDataLayoutByMemoryDesc(desc));
    };

    const VectorDims dataDims = srcDescs[0]->getShape().getDims();
    VectorDims weightDims = srcDescs[1]->getShape().getDims();
    // Swap the input/output channel dimensions to align with ACL:
    // ACL expects an [O, I, H, W] weights tensor while OV uses [I, O, H, W].
    std::swap(weightDims[0], weightDims[1]);
    const VectorDims outDims = dstDescs[0]->getShape().getDims();

    TensorInfo biasInfo;
    if (deconvAttrs.withBiasesParam) {
        biasInfo = makeTensorInfo(srcDescs[2]->getShape().getStaticDims(), srcDescs[2]);
    }

    TensorInfo dataInfo = makeTensorInfo(dataDims, srcDescs[0]);
    TensorInfo weightInfo = makeTensorInfo(weightDims, srcDescs[1]);
    TensorInfo outInfo = makeTensorInfo(outDims, dstDescs[0]);

    // Element 1 feeds the left/right/x values when more than one element is present;
    // otherwise the only element (index 0) is reused.
    const auto xComponent = [](const auto& values) {
        return static_cast<unsigned int>(values.size() > 1 ? values.at(1) : values.at(0));
    };
    // Element 0 always feeds the top/bottom/y values.
    const auto yComponent = [](const auto& values) {
        return static_cast<unsigned int>(values.at(0));
    };

    const unsigned int padLeft = xComponent(deconvAttrs.paddingL);
    const unsigned int padRight = xComponent(deconvAttrs.paddingR);
    const unsigned int padTop = yComponent(deconvAttrs.paddingL);
    const unsigned int padBottom = yComponent(deconvAttrs.paddingR);
    const unsigned int strideX = xComponent(deconvAttrs.stride);
    const unsigned int strideY = yComponent(deconvAttrs.stride);
    PadStrideInfo padStride(strideX, strideY, padLeft, padRight, padTop, padBottom, DimensionRoundingType::FLOOR);

    return ACLDeconvTensorInfo{dataInfo, weightInfo, biasInfo, outInfo, padStride};
}

// Forwards the executor context to the DeconvExecutor base; the ACL tensors and layer are set up in init().
AclDeconvExecutor::AclDeconvExecutor(const ExecutorContext::CPtr context)
    : DeconvExecutor(context) {}

/**
 * @brief Validates and configures the ACL deconvolution layer for the given descriptors.
 *
 * Stores the attributes, asks NEDeconvolutionLayer::validate whether the configuration is usable,
 * initialises the tensor allocators, configures the layer and sizes the buffer that receives the
 * transposed weights.
 *
 * @param deconvAttrs deconvolution attributes (copied into the executor)
 * @param srcDescs    input descriptors: [0] data, [1] weights, [2] bias when enabled
 * @param dstDescs    output descriptors: [0] result
 * @param attr        not used by this implementation
 * @return false when ACL validation rejects the configuration, true otherwise
 */
bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs,
                             const dnnl::primitive_attr &attr) {
    this->deconvAttrs = deconvAttrs;
    const bool withBias = deconvAttrs.withBiasesParam;

    ACLDeconvTensorInfo info = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs);

    arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&info.srcTensorInfo,
                                                                             &info.weiTensorInfo,
                                                                             withBias ? &info.biasTensorInfo : nullptr,
                                                                             &info.dstTensorInfo,
                                                                             info.deconv_info);
    if (!status) {
        DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
        return false;
    }

    srcTensor.allocator()->init(info.srcTensorInfo);
    weiTensor.allocator()->init(info.weiTensorInfo);
    dstTensor.allocator()->init(info.dstTensorInfo);
    if (withBias) {
        biasTensor.allocator()->init(info.biasTensorInfo);
    }

    deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
    deconv->configure(&srcTensor, &weiTensor, withBias ? &biasTensor : nullptr, &dstTensor, info.deconv_info);

    // The weights shape is changed because ACL expects an [O, I, H, W] tensor while OV uses [I, O, H, W];
    // this buffer holds the transposed copy and has one element per weight of the 4D tensor.
    const auto& weightDims = srcDescs[1]->getShape().getStaticDims();
    weiBuffer = std::vector<float>(weightDims[0] * weightDims[1] * weightDims[2] * weightDims[3]);
    return true;
}

/**
 * @brief Copies a 4D float tensor into dst_data with its first two dimensions swapped.
 *
 * Element [d0][d1][d2][d3] of the source is written to position [d1][d0][d2][d3] of the
 * destination, i.e. the (1, 0, 2, 3) permutation. The innermost two dimensions stay contiguous.
 *
 * @param srcMemPtr source memory; its data is read as float and its first four static dims are used
 * @param dst_data  destination buffer; it is indexed without bounds checks, so it must already hold
 *                  at least DIM0 * DIM1 * DIM2 * DIM3 elements
 */
static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
    const auto* src_data = reinterpret_cast<const float*>(srcMemPtr->getData());

    const auto& dims = srcMemPtr->getStaticDims();
    const int DIM0 = dims[0];
    const int DIM1 = dims[1];
    const int DIM2 = dims[2];

    // Offsets are computed in size_t: the previous int arithmetic overflowed (undefined behaviour)
    // once the tensor held more than INT_MAX elements.
    const size_t rowLen = dims[3];
    const size_t planeLen = static_cast<size_t>(dims[2]) * rowLen;

    parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) {
        const size_t rowShift = static_cast<size_t>(dim2) * rowLen;
        // Source is laid out as [dim0][dim1][dim2][dim3], destination as [dim1][dim0][dim2][dim3].
        const size_t src_row = (static_cast<size_t>(dim0) * static_cast<size_t>(DIM1) + static_cast<size_t>(dim1)) * planeLen + rowShift;
        const size_t dst_row = (static_cast<size_t>(dim1) * static_cast<size_t>(DIM0) + static_cast<size_t>(dim0)) * planeLen + rowShift;

        for (size_t dim3 = 0; dim3 < rowLen; ++dim3) {
            dst_data[dst_row + dim3] = src_data[src_row + dim3];
        }
    });
}

void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
// TODO: Remove transpose from exec
transpose_to_1023(src[1], weiBuffer);

srcTensor.allocator()->import_memory(src[0]->getData());
dstTensor.allocator()->import_memory(dst[0]->getData());
weiTensor.allocator()->import_memory(weiBuffer.data());
if (deconvAttrs.withBiasesParam)
biasTensor.allocator()->import_memory(src[2]->getData());
deconv->run();

srcTensor.allocator()->free();
dstTensor.allocator()->free();
weiTensor.allocator()->free();
if (deconvAttrs.withBiasesParam)
biasTensor.allocator()->free();
}

bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs,
const std::vector<MemoryDescPtr> &srcDescs,
const std::vector<MemoryDescPtr> &dstDescs) {
if ((srcDescs[0]->getShape().getDims().size() != 3 && srcDescs[0]->getShape().getDims().size() != 4) ||
dstDescs[0]->getShape().getDims().size() != srcDescs[0]->getShape().getDims().size() ||
srcDescs[1]->getShape().getDims().size() != 4) {
DEBUG_LOG("AclDeconvExecutor does not support dimension:",
" src[0]=", srcDescs[0]->getShape().getDims().size(),
" src[1]=", srcDescs[1]->getShape().getDims().size(),
" dst[0]=", dstDescs[0]->getShape().getDims().size());
return false;
}

// TODO: Ticket CVS-114087 - enable FP16 when check FP16 scoup
if (!(one_of(srcDescs[0]->getPrecision(), /*InferenceEngine::Precision::FP16, */InferenceEngine::Precision::FP32) &&
srcDescs[0]->getPrecision() == srcDescs[1]->getPrecision() &&
srcDescs[1]->getPrecision() == dstDescs[0]->getPrecision())) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}

if (deconvAttrs.withBiasesParam && srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[2]=", srcDescs[2]->getPrecision());
return false;
}

if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}

if (deconvAttrs.withBiasesParam &&
!(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[2]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" src[2]=", srcDescs[2]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}

ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs);
TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo;
TensorInfo weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo;
TensorInfo biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo;
TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo;
PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info;

unsigned int kernel_x = (deconvAttrs.kernel.size() > 1) ? deconvAttrs.kernel.at(1) : deconvAttrs.kernel.at(0);
unsigned int kernel_y = deconvAttrs.kernel.at(0);

// After stride=8 up-sampling in ACL Deconvolution layer slower than reference
if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) return false;

unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0);
unsigned int dilation_y = deconvAttrs.dilation.at(0);
if (!one_of(dilation_x, static_cast<unsigned int >(0), static_cast<unsigned int >(1)) ||
!one_of(dilation_y, static_cast<unsigned int >(0), static_cast<unsigned int >(1))) return false;

size_t in_h = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[2] : srcDescs[0]->getShape().getDims()[1];
size_t in_w = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[3] : srcDescs[0]->getShape().getDims()[2];

// Validate function has bug (https://github.com/ARM-software/ComputeLibrary/issues/1061) with error exception.
// We copy deconvolution_output_dimensions function for get correct validation
// TODO: remove after fix
if (validate_deconvolution_output_dimensions(in_w, in_h, kernel_x, kernel_y, deconv_info)) {
DEBUG_LOG("NEDeconvolutionLayer arm_compute::deconvolution_output_dimensions failed");
return false;
}

arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
&weiTensorInfo,
deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr,
&dstTensorInfo,
deconv_info);
if (!status) {
DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
return false;
}

return true;
}

/**
 * @brief Detects input/kernel/padding combinations for which the deconvolution output size is invalid.
 *
 * Note the polarity: the function returns true when the dimensions are NOT usable.
 *
 * @param in_width        input width
 * @param in_height       input height
 * @param kernel_width    kernel width
 * @param kernel_height   kernel height
 * @param pad_stride_info strides and paddings of the deconvolution
 * @return true when an input dimension is zero, or when (in - 1) * stride + kernel is smaller than
 *         the summed padding along either axis; false otherwise
 */
bool AclDeconvExecutorBuilder::validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
                                                                        unsigned int kernel_width,
                                                                        unsigned int kernel_height,
                                                                        const PadStrideInfo &pad_stride_info) {
    // An empty input is rejected first; this also keeps the "- 1" below from wrapping around.
    if (in_width < 1 || in_height < 1) {
        return true;
    }

    const auto strides = pad_stride_info.stride();

    const unsigned int padded_x = pad_stride_info.pad_left() + pad_stride_info.pad_right();
    const unsigned int extent_x = (in_width - 1) * strides.first + kernel_width;
    if (extent_x < padded_x) {
        return true;
    }

    const unsigned int padded_y = pad_stride_info.pad_top() + pad_stride_info.pad_bottom();
    const unsigned int extent_y = (in_height - 1) * strides.second + kernel_height;
    return extent_y < padded_y;
}
} // namespace intel_cpu
} // namespace ov
78 changes: 78 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/deconv.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

// Bundle of ACL descriptors produced by getACLDeconvTensorInfo() for one deconvolution.
// The member order is relied upon by aggregate initialisation, so do not reorder the fields.
struct ACLDeconvTensorInfo {
// Descriptor of the input data tensor.
arm_compute::TensorInfo srcTensorInfo;
// Descriptor of the weights tensor.
arm_compute::TensorInfo weiTensorInfo;
// Descriptor of the bias tensor.
arm_compute::TensorInfo biasTensorInfo;
// Descriptor of the output tensor.
arm_compute::TensorInfo dstTensorInfo;
// Strides and paddings of the deconvolution.
arm_compute::PadStrideInfo deconv_info;
};

// Builds the ACL tensor descriptors and pad/stride info for a deconvolution from the
// deconvolution attributes and the source/destination memory descriptors.
ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs);

// Deconvolution executor that wraps arm_compute::NEDeconvolutionLayer.
class AclDeconvExecutor : public DeconvExecutor {
public:
explicit AclDeconvExecutor(const ExecutorContext::CPtr context);
// Prepares the executor for the given attributes and descriptors; returns a success flag.
bool init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
// Executes the deconvolution on the given source and destination memory.
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;

// Reports the implementation type of this executor (impl_desc_type::acl).
impl_desc_type getImplType() const override {
return implType;
}

private:
// Attributes of the deconvolution handled by this executor.
DeconvAttrs deconvAttrs;
impl_desc_type implType = impl_desc_type::acl;

// ACL tensors for the input data, weights, bias and output.
arm_compute::Tensor srcTensor;
arm_compute::Tensor weiTensor;
arm_compute::Tensor biasTensor;
arm_compute::Tensor dstTensor;
// The ACL deconvolution function; null until one is created.
std::unique_ptr<arm_compute::NEDeconvolutionLayer> deconv = nullptr;

// Float buffer associated with the weights tensor.
std::vector<float> weiBuffer;
};

// Builder that reports whether the ACL deconvolution executor supports a configuration
// and creates AclDeconvExecutor instances.
class AclDeconvExecutorBuilder : public DeconvExecutorBuilder {
public:
// Static support check, callable without a builder instance.
static bool customIsSupported(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs);

// Delegates to customIsSupported().
bool isSupported(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
return customIsSupported(deconvAttrs, srcDescs, dstDescs);
}

// Creates a new AclDeconvExecutor bound to the given executor context.
DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<AclDeconvExecutor>(context);
}

private:
// Helper for checking deconvolution output dimensions from the input size, kernel size
// and pad/stride info.
static bool validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
unsigned int kernel_width, unsigned int kernel_height,
const arm_compute::PadStrideInfo &pad_stride_info);
};

} // namespace intel_cpu
} // namespace ov
13 changes: 13 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/deconv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "deconv.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

} // namespace intel_cpu
} // namespace ov
Loading

0 comments on commit f3bafef

Please sign in to comment.