Skip to content

Commit

Permalink
[CPU][ARM] Add ACL executor for Convert (openvinotoolkit#17323)
Browse files Browse the repository at this point in the history
* separate executors + add acl executor for convert

* update convert

* enabled tests and lot of changes in acl executor

* fixed different signedness comparison

* added expectedPrimitiveType method

* fixed comments

* fp16 WAs

* enable fp16 convert tests

* Revert "enable fp16 convert tests"

This reverts commit 037af67.

* Revert "fp16 WAs"

This reverts commit 3db3d42.

* fixed comments

* updated expected primitive to ref

* fixed comments

* getDescWithType name refactoring

* GetPtr to getData refactoring

* GetPtr to getData refactoring

---------

Co-authored-by: Aleksandr Voron <aleksandr.voron@intel.com>
  • Loading branch information
allnes and alvoron authored Jul 13, 2023
1 parent d026ae7 commit 6822eb6
Show file tree
Hide file tree
Showing 15 changed files with 778 additions and 247 deletions.
47 changes: 32 additions & 15 deletions src/plugins/intel_cpu/src/nodes/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include <dnnl_extension_utils.h>
#include "convert.h"
#include "common/cpu_convert.h"
#include "common/blocked_desc_creator.h"
#include <ngraph/opsets/opset1.hpp>
#include <ie_ngraph_utils.hpp>
Expand Down Expand Up @@ -41,13 +40,13 @@ Convert::Convert(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CP
}

auto convert = ov::as_type_ptr<const ngraph::opset1::Convert>(op);
origPrc = details::convertPrecision(convert->get_destination_type());
convertParams.origPrc = details::convertPrecision(convert->get_destination_type());
}

Convert::Convert(const Shape &shape, const InferenceEngine::Precision &inPrc, const InferenceEngine::Precision &outPrc,
const std::string &nodeName, const GraphContext::CPtr context)
: Node("Convert", nodeName, context)
, origPrc(outPrc) {
: Node("Convert", nodeName, context) {
convertParams.origPrc = outPrc;
inputShapes.push_back(shape);
addOriginalInputPrecision(inPrc);
outputShapes.push_back(shape);
Expand Down Expand Up @@ -96,6 +95,16 @@ void Convert::initSupportedPrimitiveDescriptors() {
canInitExternalDesc &= isSupportedDesc(*output);
}

auto supportedPrimitiveDescriptorsBuilder = [this](NodeConfig config) {
MemoryDescPtr srcMemoryDesc = config.inConfs[0].getMemDesc();
MemoryDescPtr dstMemoryDesc = config.outConfs[0].getMemDesc();
convertParams.srcPrc = srcMemoryDesc->getPrecision();
convertParams.dstPrc = dstMemoryDesc->getPrecision();
auto factory = std::make_shared<ConvertExecutorFactory>(convertParams, srcMemoryDesc, dstMemoryDesc,
std::make_shared<ExecutorContext>(context, getImplPriority()));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, factory);
};

// if input and output pointers are not null and not contain extra data, then the inp/output tensor descriptors were set using setDescs method, so
// they should be used as the actual descriptors.
if (canInitExternalDesc) {
Expand All @@ -106,7 +115,7 @@ void Convert::initSupportedPrimitiveDescriptors() {
dataConfigOut.setMemDesc(config.inConfs[0].getMemDesc());
dataConfigOut.setMemDesc(dataConfigOut.getMemDesc()->cloneWithNewPrecision(output->getPrecision()));
config.outConfs.push_back(dataConfigOut);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
supportedPrimitiveDescriptorsBuilder(config);
} else if (inputShapes.size() == 1 && outputShapes.size() == 1) {
const Shape& insShape = getInputShapeAtPort(0);
auto insPrecision = getOriginalInputPrecisionAtPort(0);
Expand All @@ -123,13 +132,27 @@ void Convert::initSupportedPrimitiveDescriptors() {
config.inConfs[0].setMemDesc(std::make_shared<CpuBlockedMemoryDesc>(itr->second->createDesc(insPrecision, insShape)));
config.outConfs[0].setMemDesc(std::make_shared<CpuBlockedMemoryDesc>(itr->second->createDesc(outPrecision, outputShape)));

supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
supportedPrimitiveDescriptorsBuilder(config);
}
} else {
IE_THROW() << errorPrefix << " has incorrect number of input/output edges";
}
}

// Recomputes the element count for the current (possibly re-shaped) input and
// builds a concrete Convert executor via the factory stored in the selected
// primitive descriptor.
void Convert::prepareParams() {
    auto& parentMem = getParentEdgeAt(0)->getMemory();
    // Padded element count: the whole (blocked) buffer is converted, including padding.
    convertParams.size = parentMem.getDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();

    auto selectedPD = getSelectedPrimitiveDescriptor();
    MemoryDescPtr srcDesc = getParentEdgeAt(0)->getMemoryPtr()->getDescPtr();
    MemoryDescPtr dstDesc = getChildEdgeAt(0)->getMemoryPtr()->getDescPtr();
    execPtr = selectedPD->getExecutorFactoryAs<ConvertExecutorFactory>()->makeExecutor(convertParams,
                                                                                      srcDesc,
                                                                                      dstDesc,
                                                                                      {});
    // Expose which backend (e.g. acl/ref) the factory actually picked.
    selectedPD->setImplementationType(execPtr->getImplType());
}

// Dynamic-shape execution simply delegates to the static path; the executor
// was already rebuilt for the current shapes in prepareParams().
void Convert::executeDynamicImpl(dnnl::stream strm) {
    execute(strm);
}
Expand All @@ -144,15 +167,9 @@ void Convert::execute(dnnl::stream strm) {
if (parentPaddElemCount != childPaddElemCount)
IE_THROW() << errorPrefix << " has different elements number in input and output buffers";

void* srcPtr = parentMem.getData();
void* dstPtr = childMem.getData();

cpu_convert(srcPtr,
dstPtr,
parentMem.getDesc().getPrecision(),
origPrc,
childMem.getDesc().getPrecision(),
parentPaddElemCount);
MemoryCPtr srcMemory = getParentEdgeAt(0)->getMemoryPtr();
MemoryPtr dstMemory = getChildEdgeAt(0)->getMemoryPtr();
execPtr->exec(srcMemory, dstMemory);
}

bool Convert::created() const {
Expand Down
8 changes: 6 additions & 2 deletions src/plugins/intel_cpu/src/nodes/convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <node.h>
#include <string>
#include <vector>
#include "executors/convert_list.hpp"

namespace ov {
namespace intel_cpu {
Expand All @@ -21,6 +22,7 @@ class Convert : public Node {

void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void prepareParams() override;
void execute(dnnl::stream strm) override;
void executeDynamicImpl(dnnl::stream strm) override;
bool created() const override;
Expand All @@ -40,7 +42,7 @@ class Convert : public Node {
const MemoryDesc& getInput() const { return *input; }
const MemoryDesc& getOutput() const { return *output; }

bool needPrepareParams() const override { return false; }
bool needPrepareParams() const override { return inputShapesModified(); }

static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

Expand All @@ -49,7 +51,9 @@ class Convert : public Node {
private:
MemoryDescPtr input;
MemoryDescPtr output;
InferenceEngine::Precision origPrc;
ConvertParams convertParams;
std::shared_ptr<ConvertExecutor> execPtr = nullptr;
NodeConfig config;

std::string errorPrefix;
};
Expand Down
124 changes: 124 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_convert.hpp"
#include "acl_utils.hpp"

// Validates the requested precision pair with ACL and configures the NEON
// function: NECopy when source and destination precisions match, NECast
// otherwise. Returns false when ACL rejects the configuration.
bool ov::intel_cpu::ACLConvertExecutor::init(const ov::intel_cpu::ConvertParams& convertParams,
                                             const MemoryDescPtr& srcDesc,
                                             const MemoryDescPtr& dstDesc,
                                             const dnnl::primitive_attr& attr) {
    aclConvertParams = convertParams;
    isCopyOp = aclConvertParams.srcPrc == aclConvertParams.dstPrc;

    auto srcType = precisionToAclDataType(aclConvertParams.srcPrc);
    auto dstType = precisionToAclDataType(aclConvertParams.dstPrc);
    // NECast does not support S8. It could be replaced with QASYMM8_SIGNED
    if (!isCopyOp) {
        if (srcType == arm_compute::DataType::S8)
            srcType = arm_compute::DataType::QASYMM8_SIGNED;
        if (dstType == arm_compute::DataType::S8)
            dstType = arm_compute::DataType::QASYMM8_SIGNED;
    }

    const arm_compute::TensorInfo srcInfo(shapeCast(srcDesc->getShape().getStaticDims()), 1, srcType,
                                          getAclDataLayoutByMemoryDesc(srcDesc));
    const arm_compute::TensorInfo dstInfo(shapeCast(dstDesc->getShape().getStaticDims()), 1, dstType,
                                          getAclDataLayoutByMemoryDesc(dstDesc));

    // Ask ACL whether the configuration is runnable before committing to it.
    if (isCopyOp) {
        const arm_compute::Status status = arm_compute::NECopy::validate(&srcInfo, &dstInfo);
        if (!status) {
            DEBUG_LOG("NECopy validation failed: ", status.error_description());
            return false;
        }
    } else {
        const arm_compute::Status status =
                arm_compute::NECast::validate(&srcInfo, &dstInfo, arm_compute::ConvertPolicy::SATURATE);
        if (!status) {
            DEBUG_LOG("NECast validation failed: ", status.error_description());
            return false;
        }
    }

    srcTensor.allocator()->init(srcInfo);
    dstTensor.allocator()->init(dstInfo);

    if (isCopyOp) {
        acl_copy = std::make_unique<arm_compute::NECopy>();
        acl_copy->configure(&srcTensor, &dstTensor);
    } else {
        acl_cast = std::make_unique<arm_compute::NECast>();
        acl_cast->configure(&srcTensor, &dstTensor, arm_compute::ConvertPolicy::SATURATE);
    }
    return true;
}

// Binds the caller-owned buffers to the pre-configured ACL tensors, runs the
// kernel selected in init(), then detaches the imported memory.
void ov::intel_cpu::ACLConvertExecutor::exec(const MemoryCPtr& src, const MemoryPtr& dst) {
    srcTensor.allocator()->import_memory(src->getData());
    dstTensor.allocator()->import_memory(dst->getData());

    isCopyOp ? acl_copy->run() : acl_cast->run();

    // free() releases the imported binding; the buffers themselves stay owned by the caller.
    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
}

// Reports whether the ACL backend can handle the requested conversion.
// Equal precisions (the NECopy path) are always supported; otherwise the
// source/destination pair must be one NECast accepts.
// NOTE: the log messages previously said "NECopy", but this branch only runs
// when precisions differ, i.e. the NECast path — fixed to say "NECast".
bool ov::intel_cpu::ACLConvertExecutorBuilder::isSupported(const ConvertParams& convertParams,
                                                           const MemoryDescPtr& srcDesc,
                                                           const MemoryDescPtr& dstDesc) const {
    if (convertParams.srcPrc != convertParams.dstPrc) {
        // Source precisions NECast can read from.
        if (!one_of(convertParams.srcPrc,
                    InferenceEngine::Precision::I8,
                    InferenceEngine::Precision::U8,
                    InferenceEngine::Precision::U16,
                    InferenceEngine::Precision::I16,
                    InferenceEngine::Precision::FP16,
                    InferenceEngine::Precision::I32,
                    InferenceEngine::Precision::FP32)) {
            DEBUG_LOG("NECast does not support source precision: ", convertParams.srcPrc.name());
            return false;
        }
        // Per-source whitelist of destination precisions supported by NECast.
        if ((convertParams.srcPrc == InferenceEngine::Precision::I8 && !one_of(convertParams.dstPrc,
                                                                               InferenceEngine::Precision::I16,
                                                                               InferenceEngine::Precision::I32,
                                                                               InferenceEngine::Precision::FP16,
                                                                               InferenceEngine::Precision::FP32)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::U8 && !one_of(convertParams.dstPrc,
                                                                               InferenceEngine::Precision::U16,
                                                                               InferenceEngine::Precision::I16,
                                                                               InferenceEngine::Precision::I32,
                                                                               InferenceEngine::Precision::FP16,
                                                                               InferenceEngine::Precision::FP32)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::U16 && !one_of(convertParams.dstPrc,
                                                                                InferenceEngine::Precision::U8,
                                                                                InferenceEngine::Precision::U32)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::I16 && !one_of(convertParams.dstPrc,
                                                                                InferenceEngine::Precision::I8,
                                                                                InferenceEngine::Precision::U8,
                                                                                InferenceEngine::Precision::I32)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::FP16 && !one_of(convertParams.dstPrc,
                                                                                 InferenceEngine::Precision::I8,
                                                                                 InferenceEngine::Precision::FP32,
                                                                                 InferenceEngine::Precision::I32,
                                                                                 InferenceEngine::Precision::U8)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::I32 && !one_of(convertParams.dstPrc,
                                                                                InferenceEngine::Precision::I8,
                                                                                InferenceEngine::Precision::FP16,
                                                                                InferenceEngine::Precision::FP32,
                                                                                InferenceEngine::Precision::U8)) ||
            (convertParams.srcPrc == InferenceEngine::Precision::FP32 && !one_of(convertParams.dstPrc,
                                                                                 InferenceEngine::Precision::BF16,
                                                                                 InferenceEngine::Precision::FP16,
                                                                                 InferenceEngine::Precision::I32))) {
            DEBUG_LOG("NECast does not support passed combination of source and destination precisions. ",
                      "source precision: ", convertParams.srcPrc.name(),
                      " destination precision: ", convertParams.dstPrc.name());
            return false;
        }
    }
    return true;
}
43 changes: 43 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/convert.hpp"
#include "utils/debug_capabilities.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

// Converts a tensor between precisions with Arm Compute Library NEON kernels:
// NECopy when src/dst precisions are equal, NECast otherwise (see init()).
class ACLConvertExecutor : public ConvertExecutor {
public:
    using ConvertExecutor::ConvertExecutor;
    // Validates the precision pair with ACL and configures NECopy/NECast;
    // returns false if ACL rejects the configuration.
    bool init(const ConvertParams& convertParams,
              const MemoryDescPtr& srcDesc,
              const MemoryDescPtr& dstDesc,
              const dnnl::primitive_attr &attr) override;
    // Imports src/dst buffers into the ACL tensors and runs the configured kernel.
    void exec(const MemoryCPtr& src, const MemoryPtr& dst) override;
    impl_desc_type getImplType() const override { return implDescType; };
protected:
    ConvertParams aclConvertParams;
    bool isCopyOp;  // true when srcPrc == dstPrc (NECopy path)
    static const impl_desc_type implDescType = impl_desc_type::acl;
    arm_compute::Tensor srcTensor, dstTensor;
    // Exactly one of these is created by init(), depending on isCopyOp.
    std::unique_ptr<arm_compute::NECopy> acl_copy;
    std::unique_ptr<arm_compute::NECast> acl_cast;
};

// Builder registered in the Convert executor list: answers whether ACL can
// handle a given precision pair and constructs ACLConvertExecutor instances.
class ACLConvertExecutorBuilder : public ConvertExecutorBuilder {
public:
    // True when the src/dst precision combination is supported by NECast
    // (equal precisions are always supported via NECopy).
    bool isSupported(const ConvertParams& convertParams,
                     const MemoryDescPtr& srcDesc,
                     const MemoryDescPtr& dstDesc) const override;
    ConvertExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<ACLConvertExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
23 changes: 23 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ref_convert.hpp"
#include "nodes/common/cpu_convert.h"

// The reference executor needs no precompiled state: it only records the
// conversion parameters for use in exec(). Descriptors and attributes are
// accepted for interface uniformity and intentionally unused. Always succeeds.
bool ov::intel_cpu::CommonConvertExecutor::init(const ov::intel_cpu::ConvertParams& convertParams,
                                                const MemoryDescPtr& srcDesc,
                                                const MemoryDescPtr& dstDesc,
                                                const dnnl::primitive_attr& attr) {
    commonConvertParams = convertParams;
    return true;
}

// Performs the conversion with the generic scalar cpu_convert() routine,
// using the parameters captured in init().
void ov::intel_cpu::CommonConvertExecutor::exec(const MemoryCPtr& src, const MemoryPtr& dst) {
    const auto& params = commonConvertParams;
    cpu_convert(src->getData(), dst->getData(),
                params.srcPrc, params.origPrc, params.dstPrc,
                params.size);
}
42 changes: 42 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/convert.hpp"

namespace ov {
namespace intel_cpu {

// Reference Convert executor implemented on top of the generic cpu_convert()
// routine; serves as the universal fallback for any precision pair.
class CommonConvertExecutor : public ConvertExecutor {
public:
    using ConvertExecutor::ConvertExecutor;
    // Stores convertParams for exec(); descriptors/attr are unused. Always true.
    bool init(const ConvertParams& convertParams,
              const MemoryDescPtr& srcDesc,
              const MemoryDescPtr& dstDesc,
              const dnnl::primitive_attr &attr) override;
    // Runs cpu_convert() over the whole (padded) element range.
    void exec(const MemoryCPtr& src, const MemoryPtr& dst) override;
    impl_desc_type getImplType() const override { return implDescType; };
protected:
    ConvertParams commonConvertParams;
    static const impl_desc_type implDescType = impl_desc_type::ref;
    // NOTE(review): removed 'const ExecutorContext::CPtr convertContext;' — it
    // shadowed the identically named member that the ConvertExecutor base-class
    // constructor initializes, and this copy was never initialized or used.
};


// Builder for the reference Convert executor. Claims support for every
// precision pair, making it the fallback when no specialized backend applies.
class CommonConvertExecutorBuilder : public ConvertExecutorBuilder {
public:
    ~CommonConvertExecutorBuilder() = default;
    bool isSupported(const ConvertParams& convertParams,
                     const MemoryDescPtr& srcDesc,
                     const MemoryDescPtr& dstDesc) const override {
        return true;
    }
    ConvertExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<CommonConvertExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
7 changes: 7 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/convert.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "convert.hpp"

// Base-class constructor: captures the executor context; concrete backends
// (ref/ACL) finish their setup later in init().
ov::intel_cpu::ConvertExecutor::ConvertExecutor(const ov::intel_cpu::ExecutorContext::CPtr context) : convertContext(context) {}
Loading

0 comments on commit 6822eb6

Please sign in to comment.