Skip to content

Commit

Permalink
[CPU][ARM] Add ACL executor for Transpose (openvinotoolkit#17322)
Browse files Browse the repository at this point in the history
* separate executors + add acl executor for transpose

* correct axisCast

* update transpose executors list

* update new changes

* enable tests

* fix formatting

* fixed test shapes and transpose generalization

* fixed different signedness error

* size_t usage in loop counters

* undo unwanted changes

* fixed comments

* added i8 and fp32 to blocked x86 tests

* fixed comments

* fixed comments

* extracted general reference executor from PermuteKernel

* fix mayiuse in JitTransposeExecutorBuilder::isSupported

* getDescWithType name refactoring

* refactoring

* removed 2nd executor creation in transpose node

* Moved RefOptimizedTranspose to the top

* fixed comments

---------

Co-authored-by: Aleksandr Voron <aleksandr.voron@intel.com>
  • Loading branch information
allnes and alvoron authored Jul 14, 2023
1 parent 73ed804 commit 238c7fa
Show file tree
Hide file tree
Showing 23 changed files with 1,311 additions and 697 deletions.
176 changes: 5 additions & 171 deletions src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "cpu/x64/jit_generator.hpp"
#include <common/primitive_hashing_utils.hpp>
#include "nodes/executors/transpose.hpp"
#include "nodes/executors/common/ref_transpose.hpp"

using namespace InferenceEngine;
using namespace dnnl;
Expand Down Expand Up @@ -146,121 +148,7 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge
#endif // OPENVINO_ARCH_X86_64

PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) {
prepareParams();
}

void PermuteKernel::prepareParams() {
SizeVector src_block_strides(params.src_block_dims.size(), 1);
SizeVector dst_block_strides(params.dst_block_dims.size(), 1);
for (int i = params.src_block_dims.size() - 2; i >= 0; i--)
src_block_strides[i] = src_block_strides[i + 1] * params.src_block_dims[i + 1];
for (int i = params.dst_block_dims.size() - 2; i >= 0; i--)
dst_block_strides[i] = dst_block_strides[i + 1] * params.dst_block_dims[i + 1];

SizeVector new_dst_block_strides = dst_block_strides;
SizeVector new_dst_block_order = params.dst_block_order;
SizeVector new_dst_block_dims = params.dst_block_dims;
SizeVector new_src_block_strides(dst_block_strides.size());
SizeVector mask(dst_block_strides.size());

SizeVector tmp_order;
for (size_t i = 0; i < params.dst_block_order.size(); i++) {
tmp_order.push_back(params.order[params.dst_block_order[i]]);
}

for (int i = tmp_order.size() - 1; i >= 0; i--) {
int pos = std::distance(std::find(
params.src_block_order.rbegin(), params.src_block_order.rend(), tmp_order[i]), params.src_block_order.rend() - 1);
if (pos != -1) {
new_src_block_strides[i] = src_block_strides[pos];
params.src_block_order.erase(params.src_block_order.begin() + pos);
src_block_strides.erase(src_block_strides.begin() + pos);
mask[i] = 0;
} else {
new_src_block_strides[i] = new_src_block_strides[tmp_order.size() - 1] * params.dst_block_dims[tmp_order.size() - 1];
mask[i] = 1;
mask[tmp_order.size() - 1] = 1;
}
}
if (!params.src_block_order.empty()) {
int pos = std::distance(tmp_order.begin(), std::find(tmp_order.begin(), tmp_order.end(), params.src_block_order[0]));
new_src_block_strides.insert(new_src_block_strides.begin() + pos,
src_block_strides[0]);
new_dst_block_strides.insert(new_dst_block_strides.begin() + pos,
new_dst_block_strides[pos] * params.src_block_dims[params.src_block_dims.size() - 1]);
new_dst_block_order.insert(new_dst_block_order.begin() + pos,
new_dst_block_order[pos]);
new_dst_block_dims.insert(new_dst_block_dims.begin() + pos + 1,
params.src_block_dims[params.src_block_dims.size() - 1]);
new_dst_block_dims[pos] = div_up(new_dst_block_dims[pos], new_dst_block_dims[pos + 1]);
mask.insert(mask.begin() + pos + 1, 1);
mask[pos] = 1;
}

SizeVector sorted_src_strides;
SizeVector sorted_dst_strides;
SizeVector sorted_order;
SizeVector sorted_dst_dims;

// support dynamic batch
int batch_ord = std::distance(params.order.begin(), std::find(params.order.begin(), params.order.end(), 0));
int batch_count = 0;
int batch_pos = 0;
for (size_t i = 0; i < new_dst_block_order.size(); i++) {
if (static_cast<int>(new_dst_block_order[i]) == batch_ord) {
batch_count++;
batch_pos = i;
}
}
if (batch_count == 1) {
sorted_src_strides.push_back(new_src_block_strides[batch_pos]);
sorted_dst_strides.push_back(new_dst_block_strides[batch_pos]);
sorted_order.push_back(new_dst_block_order[batch_pos]);
sorted_dst_dims.push_back(new_dst_block_dims[batch_pos]);
jcp.supported_dynamic_batch = true;
}

int n2 = 0;
for (size_t i = 0; i < mask.size(); i++) {
if (mask[i] == 0) {
n2++;
if (batch_count == 1 && static_cast<int>(new_dst_block_order[i]) == batch_ord) {
continue;
}
sorted_src_strides.push_back(new_src_block_strides[i]);
sorted_dst_strides.push_back(new_dst_block_strides[i]);
sorted_order.push_back(new_dst_block_order[i]);
sorted_dst_dims.push_back(new_dst_block_dims[i]);
}
}
for (size_t i = 0; i < mask.size(); i++) {
if (mask[i] == 1) {
sorted_src_strides.push_back(new_src_block_strides[i]);
sorted_dst_strides.push_back(new_dst_block_strides[i]);
sorted_order.push_back(new_dst_block_order[i]);
sorted_dst_dims.push_back(new_dst_block_dims[i]);
}
}

int max_threads = parallel_get_max_threads();
const int n_max = 3; // max count dims for parallel
int n = 0;
int work_amount = sorted_dst_dims[0];
for (size_t i = 1; i < sorted_dst_dims.size() && n < n_max; i++) {
n++;
if (work_amount >= 4 * max_threads) { // 4 * max_threads is a specially selected value for best performance
break;
}
work_amount *= sorted_dst_dims[i];
}

jcp.src_strides = sorted_src_strides;
jcp.dst_strides = sorted_dst_strides;
jcp.dst_block_dims = sorted_dst_dims;
jcp.n = std::min(n, n2);
jcp.ndims = sorted_order.size();
jcp.data_size = params.data_size;

jcp = TransposeExecutor::prepareParams(params);
#if defined(OPENVINO_ARCH_X86_64)
if (mayiuse(cpu::x64::avx512_core)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx512_core>(jcp));
Expand All @@ -281,7 +169,7 @@ void PermuteKernel::execute(const uint8_t* src_data, uint8_t* dst_data, const in
return;
}

referenceExecute(src_data, dst_data, mb);
RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, mb);
}

void PermuteKernel::execute(const uint8_t* src_data, uint8_t* dst_data) {
Expand All @@ -291,7 +179,7 @@ void PermuteKernel::execute(const uint8_t* src_data, uint8_t* dst_data) {
return;
}

referenceExecute(src_data, dst_data, dst_dims[0]);
RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, dst_dims[0]);
}

void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb) {
Expand Down Expand Up @@ -343,60 +231,6 @@ void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data,
return;
}

static inline size_t parallel_init(size_t start, size_t nDims, const SizeVector& dims, SizeVector& indexes) {
    // Convert the flat work-item index 'start' into per-dimension coordinates
    // (row-major order: the innermost dimension is the last one).
    for (int d = static_cast<int>(nDims) - 1; d >= 0; --d) {
        const size_t extent = dims[d];
        indexes[d] = start % extent;
        start /= extent;
    }
    return start;
}

static inline void parallel_step(size_t nDims, const SizeVector& dims, SizeVector& indexes) {
    // Advance the multi-dimensional index by one element, odometer-style:
    // bump the innermost dimension and propagate the carry outwards on overflow.
    for (int d = static_cast<int>(nDims) - 1; d >= 0; --d) {
        if (++indexes[d] < dims[d])
            return;
        indexes[d] = 0;
    }
}

// Generic (non-JIT) reference transpose: copies every element from src to dst
// according to the per-dimension strides precomputed in 'jcp', parallelizing
// over the flattened destination index space.
//   src_data / dst_data - raw byte buffers
//   mb                  - runtime (dynamic) batch size; overrides dst dim 0
void PermuteKernel::referenceExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb) {
    SizeVector dst_dims = jcp.dst_block_dims;
    const SizeVector dst_strides = jcp.dst_strides;
    const SizeVector src_strides = jcp.src_strides;
    const size_t data_size = jcp.data_size;
    const size_t ndims = dst_dims.size();

    // Dynamic-batch support: replace the outermost dim with the runtime value.
    if (static_cast<int>(dst_dims[0]) != mb)
        dst_dims[0] = mb;

    // Total number of elements to move.
    size_t work_amount = std::accumulate(dst_dims.begin(), dst_dims.end(), 1, std::multiplies<size_t>());

    // Byte offset of the element addressed by 'indexes' under the given strides.
    auto get_idx = [ndims, data_size](const SizeVector& indexes, const SizeVector& strides) {
        size_t idx = 0;
        for (size_t i = 0; i < ndims; ++i)
            idx += indexes[i] * strides[i];
        return idx * data_size;
    };

    parallel_nt(0, [&](const int ithr, const int nthr) {
        size_t start = 0, end = 0;
        SizeVector indexes(ndims, 0);
        // Split the flat [0, work_amount) iteration space across threads.
        splitter(work_amount, nthr, ithr, start, end);

        // Seed this thread's multi-dimensional index from its starting offset.
        parallel_init(start, ndims, dst_dims, indexes);

        for (size_t iwork = start; iwork < end; ++iwork) {
            const size_t dst_idx = get_idx(indexes, dst_strides);
            const size_t src_idx = get_idx(indexes, src_strides);
            // Copy one element (data_size bytes) per iteration.
            cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx], data_size);

            parallel_step(ndims, dst_dims, indexes);
        }
    });
}

size_t PermuteParams::hash() const {
using namespace dnnl::impl;
using namespace dnnl::impl::primitive_hashing;
Expand Down
3 changes: 0 additions & 3 deletions src/plugins/intel_cpu/src/nodes/common/permute_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,7 @@ class PermuteKernel {
}

private:
void prepareParams();

void optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb);
void referenceExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb);

jit_permute_config_params jcp = {};
std::shared_ptr<jit_uni_permute_kernel> permute_kernel;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,8 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo
auto dstDims = dstDescs[0]->getShape().getDims();

if (srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc)) {
auto mover = [](VectorDims &_shape) {
std::swap(_shape[1], _shape[2]);
std::swap(_shape[2], _shape[3]);
};
mover(srcDims);
mover(dstDims);
changeLayoutToNhwc(srcDims);
changeLayoutToNhwc(dstDims);
}

auto srcTensorInfo = arm_compute::TensorInfo(shapeCast(srcDims), 1,
Expand Down
67 changes: 67 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_transpose.hpp"
#include "acl_utils.hpp"

// Initializes the ACL (Arm Compute Library) NEPermute-based transpose executor:
// builds the ACL PermutationVector from the node's permute order, validates the
// configuration with NEPermute::validate, and configures the kernel.
// Returns false (instead of throwing) when ACL rejects the configuration so the
// caller can fall back to another executor. 'attr' is accepted for interface
// compatibility and is not used here.
bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposeParams &transposeParams,
                                               const std::vector<MemoryDescPtr> &srcDescs,
                                               const std::vector<MemoryDescPtr> &dstDescs,
                                               const dnnl::primitive_attr &attr) {
    auto inputOrder = transposeParams.permuteParams.order;
    // An empty order is treated as the identity permutation 0..rank-1.
    if (inputOrder.empty()) {
        inputOrder.resize(srcDescs[0]->getShape().getRank());
        std::iota(inputOrder.begin(), inputOrder.end(), 0);
    }

    std::vector<int> vec;
    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();
    if (srcDescs[0]->hasLayoutType(LayoutType::nspc)) {
        // ACL interprets NHWC memory as NCHW, so reorder the dims manually
        // before building the ACL tensor shapes (see changeLayoutToNhwc).
        changeLayoutToNhwc(srcDims);
        changeLayoutToNhwc(dstDims);
        // Derive the permutation by locating each destination dim value inside
        // the (reversed) source dims.
        // NOTE(review): matching by dim *value* is ambiguous when two dims are
        // equal (e.g. square spatial shapes) — confirm this yields the intended
        // axis mapping in that case.
        for (int i = inputOrder.size() - 1; i >= 0 ; --i) {
            auto it = find(srcDims.rbegin(), srcDims.rend(), dstDims[i]);
            // NOTE(review): 'index' equals srcDims.size() when the value is not
            // found; the code assumes a match always exists.
            int index = it - srcDims.rbegin();
            vec.push_back(index);
        }
    } else {
        // Plain (ncsp) layout: map each axis to ACL numbering; ACL indexes
        // dimensions in reverse order, hence axisCast plus the reverse below.
        for (unsigned int i = 0; i < inputOrder.size(); ++i) {
            vec.push_back(axisCast(inputOrder[i], inputOrder.size()));
        }
        std::reverse(vec.begin(), vec.end());
    }
    arm_compute::PermutationVector order;
    for (unsigned int i = 0; i < inputOrder.size(); ++i) {
        order.set(i, vec[i]);
    }
    auto srcTensorInfo = arm_compute::TensorInfo(shapeCast(srcDims), 1,
                                                 precisionToAclDataType(srcDescs[0]->getPrecision()),
                                                 getAclDataLayoutByMemoryDesc(srcDescs[0]));
    auto dstTensorInfo = arm_compute::TensorInfo(shapeCast(dstDims), 1,
                                                 precisionToAclDataType(dstDescs[0]->getPrecision()),
                                                 getAclDataLayoutByMemoryDesc(dstDescs[0]));
    // Let ACL check whether this permutation/precision/layout combination is supported.
    arm_compute::Status status = arm_compute::NEPermute::validate(&srcTensorInfo, &dstTensorInfo, order);
    if (!status) {
        DEBUG_LOG("NEPermute validation failed: ", status.error_description());
        return false;
    }
    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    acl_permute = std::make_unique<arm_compute::NEPermute>();
    acl_permute->configure(&srcTensor, &dstTensor, order);
    return true;
}

// Runs the preconfigured NEPermute kernel on src[0] -> dst[0].
// The externally owned buffers are attached to the ACL tensors for the
// duration of the run and detached afterwards. MB is unused by this executor.
void ov::intel_cpu::ACLTransposeExecutor::exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst,
                                               const int MB) {
    auto srcBuffer = src[0]->getData();
    auto dstBuffer = dst[0]->getData();
    srcTensor.allocator()->import_memory(srcBuffer);
    dstTensor.allocator()->import_memory(dstBuffer);

    acl_permute->run();

    dstTensor.allocator()->free();
    srcTensor.allocator()->free();
}
57 changes: 57 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/transpose.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

// Transpose executor backed by Arm Compute Library's NEPermute kernel.
// init() validates and configures the kernel; exec() binds the I/O buffers
// and runs it.
class ACLTransposeExecutor : public TransposeExecutor {
public:
    using TransposeExecutor::TransposeExecutor;

    // Configures NEPermute for the given permute order and tensor descriptors.
    // Returns false when ACL rejects the configuration (caller falls back to
    // another executor).
    bool init(const TransposeParams& transposeParams,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;
    // Runs the configured permute on src[0] -> dst[0]; MB is not used by the
    // ACL implementation.
    void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const int MB) override;
    impl_desc_type getImplType() const override { return implType; }
private:
    static const impl_desc_type implType = impl_desc_type::acl;
    // ACL tensors wrapping externally owned memory (imported per exec() call).
    arm_compute::Tensor srcTensor, dstTensor;
    std::unique_ptr<arm_compute::NEPermute> acl_permute;
};

// Builder that reports whether the ACL transpose executor can handle a given
// configuration and creates ACLTransposeExecutor instances.
class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder {
public:
    bool isSupported(const TransposeParams& transposeParams,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        // NEPermute requires src and dst to share the same layout family:
        // both plain (ncsp) or both channels-last (nspc).
        if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
              dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
            !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
              dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
            // Fixed: this check (and serializeFormat below) is about layouts,
            // not precisions — the previous message said "precisions".
            DEBUG_LOG("NEPermute does not support layouts:",
                      " src: ", srcDescs[0]->serializeFormat(),
                      " dst: ", dstDescs[0]->serializeFormat());
            return false;
        }
        // ACL's NEPermute handles tensors of rank 4 at most.
        if (srcDescs[0]->getShape().getRank() > 4) {
            DEBUG_LOG("NEPermute supports up to 4D input tensor. Passed tensor rank: ",
                      srcDescs[0]->getShape().getRank());
            return false;
        }
        return true;
    }

    TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<ACLTransposeExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
11 changes: 11 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@
namespace ov {
namespace intel_cpu {

/**
 * @brief Reorders a dims vector from NCHW to NHWC order, in place.
 * ACL interprets NHWC-laid-out memory as if it were NCHW, so the dims must be
 * permuted manually: NCHW (0, 1, 2, 3) -> NHWC (0, 2, 3, 1)
 * @param shape dims vector to permute (must have at least 4 entries)
 */
inline void changeLayoutToNhwc(VectorDims& shape) {
    // Cycle the channel dim past H and W: {N, C, H, W} -> {N, H, W, C}.
    const auto channels = shape[1];
    shape[1] = shape[2];
    shape[2] = shape[3];
    shape[3] = channels;
}

/**
* @brief Return ComputeLibrary TensorShape with reverted layout schema used in ACL
* @param dims vector of dimensions to convert
Expand Down
Loading

0 comments on commit 238c7fa

Please sign in to comment.